Whamcloud - gitweb
Merge "LU-9121 lnet: User Defined Selection Policy (UDSP)"
authorGerrit Code Review <gerrit-review@whamcloud.com>
Fri, 26 Feb 2021 07:21:34 +0000 (07:21 +0000)
committerGerrit Code Review <gerrit-review@whamcloud.com>
Fri, 26 Feb 2021 07:21:34 +0000 (07:21 +0000)
23 files changed:
libcfs/include/libcfs/util/string.h
libcfs/libcfs/util/nidstrings.c
lnet/include/lnet/Makefile.am
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-types.h
lnet/include/lnet/udsp.h [new file with mode: 0644]
lnet/include/uapi/linux/lnet/libcfs_ioctl.h
lnet/include/uapi/linux/lnet/lnet-dlc.h
lnet/include/uapi/linux/lnet/lnet-types.h
lnet/include/uapi/linux/lnet/nidstr.h
lnet/lnet/Makefile.in
lnet/lnet/api-ni.c
lnet/lnet/config.c
lnet/lnet/lib-move.c
lnet/lnet/nidstrings.c
lnet/lnet/peer.c
lnet/lnet/udsp.c [new file with mode: 0644]
lnet/utils/lnetconfig/Makefile.am
lnet/utils/lnetconfig/liblnetconfig.c
lnet/utils/lnetconfig/liblnetconfig.h
lnet/utils/lnetconfig/liblnetconfig_udsp.c [new file with mode: 0644]
lnet/utils/lnetctl.c
lustre/doc/lnetctl.8

index f73fd79..1b6e819 100644 (file)
@@ -106,6 +106,7 @@ int cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, __u32 **valpp
 int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res);
 int cfs_str2num_check(char *str, int nob, unsigned *num,
                      unsigned min, unsigned max);
+int cfs_expr2str(struct list_head *list, char *str, size_t size);
 int cfs_expr_list_match(__u32 value, struct cfs_expr_list *expr_list);
 int cfs_expr_list_print(char *buffer, int count,
                        struct cfs_expr_list *expr_list);
index f9792c3..7d13e47 100644 (file)
@@ -189,6 +189,77 @@ out:
        return rc;
 }
 
+/**
+ * Render a list of numeric expression lists as a dot separated string.
+ *
+ * Every cfs_expr_list on \a list produces one component.  A range with
+ * re_lo == re_hi is printed as "N.", ranges with re_lo < re_hi are printed
+ * as "[lo-hi,lo-hi/stride]." and the trailing '.' of the last component is
+ * removed before returning.  The text is appended with strncat(), so
+ * \a str must already contain a NUL terminated string on entry.
+ *
+ * \param[in]     list  list of struct cfs_expr_list, linked through el_link
+ * \param[in,out] str   buffer the rendered text is appended to
+ * \param[in]     size  space available in \a str
+ *
+ * \retval >= 0      space left according to the internal accounting
+ *                   (one extra byte is charged for every appended piece)
+ * \retval -EINVAL   a range has re_lo > re_hi
+ * \retval -ENOBUFS  \a size is too small for the rendered expression
+ */
+int
+cfs_expr2str(struct list_head *list, char *str, size_t size)
+{
+       struct cfs_expr_list *expr;
+       struct cfs_range_expr *range;
+       char tmp[LNET_NIDSTR_SIZE];
+       size_t len;
+       bool first;
+       bool bracket = false;
+       char *format;
+       char *tmpc;
+
+       list_for_each_entry(expr, list, el_link) {
+               first = true;
+               list_for_each_entry(range, &expr->el_exprs, re_link) {
+                       if (range->re_lo == range->re_hi) {
+                               snprintf(tmp,
+                                        LNET_NIDSTR_SIZE,
+                                        "%u.", range->re_lo);
+                       } else if (range->re_lo < range->re_hi) {
+                               if (range->re_stride > 1) {
+                                       if (first)
+                                               format = "[%u-%u/%u,";
+                                       else
+                                               format = "%u-%u/%u,";
+                                       snprintf(tmp, LNET_NIDSTR_SIZE,
+                                               format, range->re_lo,
+                                               range->re_hi, range->re_stride);
+                                       bracket = true;
+                               } else {
+                                       if (first)
+                                               format = "[%u-%u,";
+                                       else
+                                               format = "%u-%u,";
+                                       snprintf(tmp, LNET_NIDSTR_SIZE,
+                                               format, range->re_lo,
+                                               range->re_hi);
+                                       bracket = true;
+                               }
+                       } else {
+                               return -EINVAL;
+                       }
+                       len = strlen(tmp);
+                       /*
+                        * size is unsigned, so it can never compare below
+                        * zero: check for room before subtracting instead
+                        * of after, otherwise the counter silently wraps.
+                        */
+                       if (size < len + 1)
+                               return -ENOBUFS;
+                       size -= (len + 1);
+
+                       strncat(str, tmp, size + len);
+                       first = false;
+               }
+               if (bracket) {
+                       if (size < 1)
+                               return -ENOBUFS;
+                       size -= 1;
+                       /*
+                        * replace the trailing ',' with ']' and append the
+                        * component separator.  The '.' lands on the old
+                        * terminator, so the string has to be terminated
+                        * again; the spare byte charged per appended piece
+                        * guarantees there is room for it.
+                        */
+                       tmpc = str + (strlen(str) - 1);
+                       *tmpc = ']';
+                       *(tmpc+1) = '.';
+                       *(tmpc+2) = '\0';
+                       bracket = false;
+               }
+       }
+
+       /*
+        * get rid of the trailing '.' at the end of the string
+        * only if we actually had something on the list passed in
+        * and something ended up in the string.
+        * otherwise we could write outside the array
+        */
+       if (!list_empty(list)) {
+               len = strlen(str);
+               if (len > 0)
+                       str[len - 1] = '\0';
+       }
+       return size;
+}
+
 static int
 libcfs_num_addr_range_expand(struct list_head *addrranges, __u32 *addrs,
                             int max_addrs)
@@ -374,7 +445,7 @@ libcfs_num_str2addr(const char *str, int nob, __u32 *addr)
  * \retval 0 if \a str parsed to numeric address
  * \retval errno otherwise
  */
-static int
+int
 libcfs_num_parse(char *str, int len, struct list_head *list)
 {
        struct cfs_expr_list *el;
@@ -917,6 +988,138 @@ parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist)
        return 0;
 }
 
+/*
+ * Return the length of the first name in libcfs_netstrfns[] that is a
+ * prefix of \a str, or 0 when no registered name matches.
+ */
+static __u32
+libcfs_net_str_len(const char *str)
+{
+       int idx = 0;
+
+       while (idx < libcfs_nnetstrfns) {
+               struct netstrfns *entry = &libcfs_netstrfns[idx];
+
+               if (strncmp(str, entry->nf_name, strlen(entry->nf_name)) == 0)
+                       return strlen(entry->nf_name);
+               idx++;
+       }
+
+       return 0;
+}
+
+/*
+ * Split the net part of a NID expression into its type and number range.
+ *
+ * \a str holds \a len bytes of the form <net type>[<number expression>].
+ * The net type is stored in \a net_type and, when a number expression is
+ * present, it is handed to libcfs_num_parse() to populate \a net_num.
+ *
+ * Returns 0 on success (including when no net number follows the type),
+ * -EINVAL for a NULL string, an unknown net or a "*[" pattern, otherwise
+ * whatever libcfs_num_parse() returns.
+ */
+int
+parse_net_range(char *str, __u32 len, struct list_head *net_num,
+               __u32 *net_type)
+{
+       struct cfs_lstr num_expr;
+       __u32 type_len;
+       __u32 net_id;
+       char *open_br;
+       char *wildcard;
+
+       if (!str)
+               return -EINVAL;
+
+       type_len = libcfs_net_str_len(str);
+
+       if (type_len >= len) {
+               net_id = libcfs_str2net(str);
+       } else {
+               /* temporarily cut the string right after the net type */
+               char saved = str[type_len];
+
+               str[type_len] = '\0';
+               net_id = libcfs_str2net(str);
+               str[type_len] = saved;
+       }
+
+       if (net_id == LNET_NIDNET(LNET_NID_ANY))
+               return -EINVAL;
+
+       *net_type = LNET_NETTYP(net_id);
+
+       /*
+        * what follows the net type is an absolute number, a '*', or an
+        * expression enclosed in []
+        */
+       open_br = strchr(str, '[');
+       wildcard = strchr(str, '*');
+
+       /* a '*' in front of the '[' is rejected */
+       if (open_br && wildcard && wildcard < open_br)
+               return -EINVAL;
+
+       num_expr.ls_str = open_br ? open_br : str + type_len;
+       num_expr.ls_len = strlen(num_expr.ls_str);
+
+       /* nothing after the net type: there is no net number to parse */
+       if (num_expr.ls_len == 0)
+               return 0;
+
+       return libcfs_num_parse(num_expr.ls_str, num_expr.ls_len,
+                               net_num);
+}
+
+/*
+ * Parse the address token \a src with the nf_parse_addrlist handler of the
+ * libcfs_netstrfns[] entry whose nf_type equals \a net_type, filling
+ * \a addr.  Returns the handler's result, or -EINVAL when no entry has
+ * that type.
+ */
+int
+parse_address(struct cfs_lstr *src, const __u32 net_type,
+             struct list_head *addr)
+{
+       struct netstrfns *entry;
+       int idx;
+
+       for (idx = 0; idx < libcfs_nnetstrfns; idx++) {
+               entry = &libcfs_netstrfns[idx];
+               if (entry->nf_type != net_type)
+                       continue;
+
+               return entry->nf_parse_addrlist(src->ls_str, src->ls_len,
+                                               addr);
+       }
+
+       return -EINVAL;
+}
+
+/*
+ * Break a NID expression into its address list, net number list and net
+ * type.
+ *
+ * The string is tokenized on '@' with cfs_gettok().  When nothing is left
+ * after the first token the whole string is treated as a net and only
+ * \a net_num and \a net_type are filled in; otherwise the first token is
+ * additionally parsed as the address into \a addr.
+ *
+ * Returns -EINVAL for a NULL string or when cfs_gettok() yields no token,
+ * otherwise the result of parse_net_range() or parse_address().
+ */
+int
+cfs_parse_nid_parts(char *str, struct list_head *addr,
+                   struct list_head *net_num, __u32 *net_type)
+{
+       struct cfs_lstr rest;
+       struct cfs_lstr addr_tok;
+       bool have_addr;
+       int rc;
+
+       if (!str)
+               return -EINVAL;
+
+       rest.ls_str = str;
+       rest.ls_len = strlen(str);
+
+       if (cfs_gettok(&rest, '@', &addr_tok) == 0)
+               return -EINVAL;
+
+       have_addr = rest.ls_str != NULL;
+       if (!have_addr) {
+               /* only the net is present: parse the full string as net */
+               rest.ls_str = str;
+               rest.ls_len = strlen(str);
+       }
+
+       rc = parse_net_range(rest.ls_str, rest.ls_len, net_num, net_type);
+
+       /*
+        * stop here when the net range failed to parse, or when it parsed
+        * fine and there is no address part to look at
+        */
+       if (rc != 0 || !have_addr)
+               return rc;
+
+       return parse_address(&addr_tok, *net_type, addr);
+}
+
 /**
  * Frees addrrange structures of \a list.
  *
@@ -1027,6 +1230,71 @@ int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist)
        return 0;
 }
 
+/*
+ * Look up the libcfs_netstrfns[] entry whose nf_type is \a net_type.
+ * Returns a pointer to the entry, or NULL when the type is not registered.
+ */
+static struct netstrfns *
+type2net_info(__u32 net_type)
+{
+       int idx = 0;
+
+       while (idx < libcfs_nnetstrfns) {
+               struct netstrfns *entry = &libcfs_netstrfns[idx];
+
+               if (entry->nf_type == net_type)
+                       return entry;
+               idx++;
+       }
+
+       return NULL;
+}
+
+/*
+ * Check whether \a net_id belongs to net type \a net_type and whether its
+ * net number is covered by \a net_num_list.
+ *
+ * Returns 1 on a match and 0 otherwise.  A NULL list never matches.
+ */
+int
+cfs_match_net(__u32 net_id, __u32 net_type, struct list_head *net_num_list)
+{
+       __u32 num;
+
+       if (!net_num_list || LNET_NETTYP(net_id) != net_type)
+               return 0;
+
+       num = LNET_NETNUM(net_id);
+
+       /*
+        * an empty list only matches a net number of zero; any other
+        * net number with an empty list is not a match.
+        */
+       if (list_empty(net_num_list))
+               return num == 0 ? 1 : 0;
+
+       return libcfs_num_match(num, net_num_list) ? 1 : 0;
+}
+
+/*
+ * Check whether \a nid matches both the address list \a addr and the net
+ * described by \a net_type and \a net_num_list.
+ *
+ * Returns 1 when both parts match.  Returns 0 when either list pointer is
+ * NULL, when the net type of \a nid has no libcfs_netstrfns[] entry, or
+ * when the address or the net does not match.
+ */
+int
+cfs_match_nid_net(lnet_nid_t nid, __u32 net_type,
+                 struct list_head *net_num_list,
+                 struct list_head *addr)
+{
+       struct netstrfns *handlers;
+       __u32 nid_addr;
+
+       if (!addr || !net_num_list)
+               return 0;
+
+       handlers = type2net_info(LNET_NETTYP(LNET_NIDNET(nid)));
+       if (!handlers)
+               return 0;
+
+       nid_addr = LNET_NIDADDR(nid);
+
+       /* the address is checked first, the net only if the address fits */
+       if (!handlers->nf_match_addr(nid_addr, addr))
+               return 0;
+
+       return cfs_match_net(LNET_NIDNET(nid), net_type, net_num_list) ? 1 : 0;
+}
 /**
  * Print the network part of the nidrange \a nr into the specified \a buffer.
  *
index 923074e..b10c1c1 100644 (file)
@@ -2,4 +2,5 @@ EXTRA_DIST = \
        api.h \
        lib-lnet.h \
        lib-types.h \
+       udsp.h \
        socklnd.h
index d245c11..e197ad0 100644 (file)
@@ -212,6 +212,7 @@ lnet_net_lock_current(void)
 extern struct kmem_cache *lnet_mes_cachep;      /* MEs kmem_cache */
 extern struct kmem_cache *lnet_small_mds_cachep; /* <= LNET_SMALL_MD_SIZE bytes
                                                  * MDs kmem_cache */
+extern struct kmem_cache *lnet_udsp_cachep;
 extern struct kmem_cache *lnet_rspt_cachep;
 extern struct kmem_cache *lnet_msg_cachep;
 
@@ -548,6 +549,7 @@ int lnet_get_rtr_pool_cfg(int idx, struct lnet_ioctl_pool_cfg *pool_cfg);
 struct lnet_ni *lnet_get_next_ni_locked(struct lnet_net *mynet,
                                        struct lnet_ni *prev);
 struct lnet_ni *lnet_get_ni_idx_locked(int idx);
+int lnet_get_net_healthv_locked(struct lnet_net *net);
 
 extern int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp,
                                struct libcfs_ioctl_hdr __user *uparam);
@@ -555,6 +557,11 @@ extern int lnet_get_peer_list(__u32 *countp, __u32 *sizep,
                              struct lnet_process_id __user *ids);
 extern void lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all);
 extern void lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni);
+extern int lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid);
+extern void lnet_peer_clr_pref_nids(struct lnet_peer_ni *lpni);
+extern int lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid);
+void lnet_peer_ni_set_selection_priority(struct lnet_peer_ni *lpni,
+                                        __u32 priority);
 
 void lnet_router_debugfs_init(void);
 void lnet_router_debugfs_fini(void);
@@ -573,6 +580,8 @@ int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf);
 int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf);
 int lnet_clear_lazy_portal(struct lnet_ni *ni, int portal, char *reason);
 struct lnet_net *lnet_get_net_locked(__u32 net_id);
+void lnet_net_clr_pref_rtrs(struct lnet_net *net);
+int lnet_net_add_pref_rtr(struct lnet_net *net, lnet_nid_t gw_nid);
 
 int lnet_islocalnid(lnet_nid_t nid);
 int lnet_islocalnet(__u32 net);
@@ -712,6 +721,17 @@ bool lnet_delay_rule_match_locked(struct lnet_hdr *hdr, struct lnet_msg *msg);
 void lnet_counters_get_common(struct lnet_counters_common *common);
 int lnet_counters_get(struct lnet_counters *counters);
 void lnet_counters_reset(void);
+/*
+ * Store \a priority as the selection priority (ni_sel_priority) of \a ni.
+ * No lock is taken here.
+ * NOTE(review): the _locked suffix suggests the caller holds the relevant
+ * lock - confirm against the callers.
+ */
+static inline void
+lnet_ni_set_sel_priority_locked(struct lnet_ni *ni, __u32 priority)
+{
+       ni->ni_sel_priority = priority;
+}
+
+/*
+ * Store \a priority as the selection priority (net_sel_priority) of \a net.
+ * No lock is taken here.
+ * NOTE(review): the _locked suffix suggests the caller holds the relevant
+ * lock - confirm against the callers.
+ */
+static inline void
+lnet_net_set_sel_priority_locked(struct lnet_net *net, __u32 priority)
+{
+       net->net_sel_priority = priority;
+}
 
 unsigned int lnet_iov_nob(unsigned int niov, struct kvec *iov);
 unsigned int lnet_kiov_nob(unsigned int niov, struct bio_vec *iov);
@@ -878,6 +898,11 @@ void lnet_debug_peer(lnet_nid_t nid);
 struct lnet_peer_net *lnet_peer_get_net_locked(struct lnet_peer *peer,
                                               __u32 net_id);
 bool lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, lnet_nid_t nid);
+int lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid);
+void lnet_peer_clr_pref_nids(struct lnet_peer_ni *lpni);
+bool lnet_peer_is_pref_rtr_locked(struct lnet_peer_ni *lpni, lnet_nid_t gw_nid);
+void lnet_peer_clr_pref_rtrs(struct lnet_peer_ni *lpni);
+int lnet_peer_add_pref_rtr(struct lnet_peer_ni *lpni, lnet_nid_t nid);
 int lnet_peer_ni_set_non_mr_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid);
 int lnet_add_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid, bool mr);
 int lnet_del_peer_ni(lnet_nid_t key_nid, lnet_nid_t nid);
@@ -890,6 +915,13 @@ int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid,
                          __u32 *peer_tx_qnob);
 int lnet_get_peer_ni_hstats(struct lnet_ioctl_peer_ni_hstats *stats);
 
+/*
+ * Store \a priority as the selection priority (lpn_sel_priority) of the
+ * peer net \a lpn.  No lock is taken here.
+ * NOTE(review): the _locked suffix suggests the caller holds the relevant
+ * lock - confirm against the callers.
+ */
+static inline void
+lnet_peer_net_set_sel_priority_locked(struct lnet_peer_net *lpn, __u32 priority)
+{
+       lpn->lpn_sel_priority = priority;
+}
+
+
 static inline struct lnet_peer_net *
 lnet_find_peer_net_locked(struct lnet_peer *peer, __u32 net_id)
 {
@@ -1036,6 +1068,18 @@ lnet_inc_healthv(atomic_t *healthv, int value)
        lnet_atomic_add_unless_max(healthv, value, LNET_MAX_HEALTH_VALUE);
 }
 
+/*
+ * Count the entries linked on \a list by walking it once from the head.
+ * Returns 0 for an empty list.
+ */
+static inline int
+lnet_get_list_len(struct list_head *list)
+{
+       struct list_head *pos;
+       int nentries = 0;
+
+       for (pos = list->next; pos != list; pos = pos->next)
+               nentries++;
+
+       return nentries;
+}
+
 void lnet_incr_stats(struct lnet_element_stats *stats,
                     enum lnet_msg_type msg_type,
                     enum lnet_stats_type stats_type);
index 9500550..22a1bdd 100644 (file)
@@ -62,6 +62,7 @@
  * All local and peer NIs created have their health default to this value.
  */
 #define LNET_MAX_HEALTH_VALUE 1000
+#define LNET_MAX_SELECTION_PRIORITY UINT_MAX
 
 /* forward refs */
 struct lnet_libmd;
@@ -369,8 +370,8 @@ struct lnet_net {
         * lnet/include/lnet/nidstr.h */
        __u32                   net_id;
 
-       /* priority of the network */
-       __u32                   net_prio;
+       /* round robin selection */
+       __u32                   net_seq;
 
        /* total number of CPTs in the array */
        __u32                   net_ncpts;
@@ -378,6 +379,9 @@ struct lnet_net {
        /* cumulative CPTs of all NIs in this net */
        __u32                   *net_cpts;
 
+       /* relative net selection priority */
+       __u32                   net_sel_priority;
+
        /* network tunables */
        struct lnet_ioctl_config_lnd_cmn_tunables net_tunables;
 
@@ -404,6 +408,9 @@ struct lnet_net {
 
        /* protects access to net_last_alive */
        spinlock_t              net_lock;
+
+       /* list of router nids preferred for this network */
+       struct list_head        net_rtr_pref_nids;
 };
 
 struct lnet_ni {
@@ -483,6 +490,9 @@ struct lnet_ni {
         */
        atomic_t                ni_fatal_error_on;
 
+       /* the relative selection priority of this NI */
+       __u32                   ni_sel_priority;
+
        /*
         * equivalent interfaces to use
         * This is an array because socklnd bonding can still be configured
@@ -514,6 +524,11 @@ struct lnet_ping_buffer {
 #define LNET_PING_INFO_TO_BUFFER(PINFO)        \
        container_of((PINFO), struct lnet_ping_buffer, pb_info)
 
+struct lnet_nid_list {
+       struct list_head nl_list;
+       lnet_nid_t nl_nid;
+};
+
 struct lnet_peer_ni {
        /* chain on lpn_peer_nis */
        struct list_head        lpni_peer_nis;
@@ -573,8 +588,12 @@ struct lnet_peer_ni {
        /* preferred local nids: if only one, use lpni_pref.nid */
        union lpni_pref {
                lnet_nid_t      nid;
-               lnet_nid_t      *nids;
+               struct list_head nids;
        } lpni_pref;
+       /* list of router nids preferred for this peer NI */
+       struct list_head        lpni_rtr_pref_nids;
+       /* The relative selection priority of this peer NI */
+       __u32                   lpni_sel_priority;
        /* number of preferred NIDs in lnpi_pref_nids */
        __u32                   lpni_pref_nnids;
 };
@@ -768,6 +787,9 @@ struct lnet_peer_net {
        /* selection sequence number */
        __u32                   lpn_seq;
 
+       /* relative peer net selection priority */
+       __u32                   lpn_sel_priority;
+
        /* reference count */
        atomic_t                lpn_refcount;
 };
@@ -979,6 +1001,49 @@ struct lnet_msg_container {
        void                    **msc_resenders;
 };
 
+/* These UDSP structures need to match the user space liblnetconfig structures
+ * in order for the marshall and unmarshall functions to be common.
+ */
+
+/* Net is described as a
+ *  1. net type
+ *  2. num range
+ */
+struct lnet_ud_net_descr {
+       __u32 udn_net_type;
+       struct list_head udn_net_num_range;
+};
+
+/* each NID range is defined as
+ *  1. net descriptor
+ *  2. address range descriptor
+ */
+struct lnet_ud_nid_descr {
+       struct lnet_ud_net_descr ud_net_id;
+       struct list_head ud_addr_range;
+       __u32 ud_mem_size;
+};
+
+/* a UDSP rule can have up to three user defined NID descriptors
+ *     - src: defines the local NID range for the rule
+ *     - dst: defines the peer NID range for the rule
+ *     - rte: defines the router NID range for the rule
+ *
+ * An action union defines the action to take when the rule
+ * is matched
+ */
+struct lnet_udsp {
+       struct list_head udsp_on_list;
+       __u32 udsp_idx;
+       struct lnet_ud_nid_descr udsp_src;
+       struct lnet_ud_nid_descr udsp_dst;
+       struct lnet_ud_nid_descr udsp_rte;
+       enum lnet_udsp_action_type udsp_action_type;
+       union {
+               __u32 udsp_priority;
+       } udsp_action;
+};
+
 /* Peer Discovery states */
 #define LNET_DC_STATE_SHUTDOWN         0       /* not started */
 #define LNET_DC_STATE_RUNNING          1       /* started up OK */
@@ -1154,6 +1219,8 @@ struct lnet {
         * work loops
         */
        struct completion               ln_started;
+       /* UDSP list */
+       struct list_head                ln_udsp_list;
 };
 
 #endif
diff --git a/lnet/include/lnet/udsp.h b/lnet/include/lnet/udsp.h
new file mode 100644 (file)
index 0000000..3ba5a30
--- /dev/null
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ *
+ * Copyright (c) 2018-2020 Data Direct Networks.
+ *
+ *   This file is part of Lustre, https://wiki.whamcloud.com/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   version 2 along with this program; If not, see
+ *   http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * Author: Amir Shehata
+ */
+
+#ifndef UDSP_H
+#define UDSP_H
+
+#include <lnet/lib-lnet.h>
+
+/**
+ * lnet_udsp_add_policy
+ *     Add a policy \new in position \idx
+ *     Must be called with api_mutex held
+ */
+int lnet_udsp_add_policy(struct lnet_udsp *new, int idx);
+
+/**
+ * lnet_udsp_get_policy
+ *     get a policy in position \idx
+ *     Must be called with api_mutex held
+ */
+struct lnet_udsp *lnet_udsp_get_policy(int idx);
+
+/**
+ * lnet_udsp_del_policy
+ *     Delete a policy from position \idx
+ *     Must be called with api_mutex held
+ */
+int lnet_udsp_del_policy(int idx);
+
+/**
+ * lnet_udsp_apply_policies
+ *     apply all stored policies across the system
+ *     Must be called with api_mutex held
+ *     Must NOT be called with lnet_net_lock held
+ *     udsp: NULL to apply on all existing udsps
+ *           non-NULL to apply to specified udsp
+ *     revert: true to revert policy application
+ */
+int lnet_udsp_apply_policies(struct lnet_udsp *udsp, bool revert);
+
+/**
+ * lnet_udsp_apply_policies_on_lpni
+ *     apply all stored policies on specified \lpni
+ *     Must be called with api_mutex held
+ *     Must be called with LNET_LOCK_EX
+ */
+int lnet_udsp_apply_policies_on_lpni(struct lnet_peer_ni *lpni);
+
+/**
+ * lnet_udsp_apply_policies_on_lpn
+ *     Must be called with api_mutex held
+ *     apply all stored policies on specified \lpn
+ *     Must be called with LNET_LOCK_EX
+ */
+int lnet_udsp_apply_policies_on_lpn(struct lnet_peer_net *lpn);
+
+/**
+ * lnet_udsp_apply_policies_on_ni
+ *     apply all stored policies on specified \ni
+ *     Must be called with api_mutex held
+ *     Must be called with LNET_LOCK_EX
+ */
+int lnet_udsp_apply_policies_on_ni(struct lnet_ni *ni);
+
+/**
+ * lnet_udsp_apply_policies_on_net
+ *     apply all stored policies on specified \net
+ *     Must be called with api_mutex held
+ *     Must be called with LNET_LOCK_EX
+ */
+int lnet_udsp_apply_policies_on_net(struct lnet_net *net);
+
+/**
+ * lnet_udsp_alloc
+ *     Allocates a UDSP block and initializes it.
+ *     Return NULL if allocation fails
+ *     pointer to UDSP otherwise.
+ */
+struct lnet_udsp *lnet_udsp_alloc(void);
+
+/**
+ * lnet_udsp_free
+ *     Free a UDSP and all its descriptors
+ */
+void lnet_udsp_free(struct lnet_udsp *udsp);
+
+/**
+ * lnet_udsp_destroy
+ *     Free all the UDSPs
+ *     shutdown: true to indicate shutdown in progress
+ */
+void lnet_udsp_destroy(bool shutdown);
+
+/**
+ * lnet_get_udsp_size
+ *     Return the size needed to store the marshalled UDSP
+ */
+size_t lnet_get_udsp_size(struct lnet_udsp *udsp);
+
+/**
+ * lnet_udsp_marshal
+ *     Marshal the udsp into the bulk memory provided.
+ *     Return success/failure.
+ */
+int lnet_udsp_marshal(struct lnet_udsp *udsp,
+                     struct lnet_ioctl_udsp *ioc_udsp);
+/**
+ * lnet_udsp_demarshal_add
+ *     Given a bulk containing a single UDSP,
+ *     demarshal and populate a udsp structure then add policy
+ */
+int lnet_udsp_demarshal_add(void *bulk, __u32 bulk_size);
+
+/**
+ * lnet_udsp_get_construct_info
+ *     get information of how the UDSP policies impacted the given
+ *     construct.
+ */
+void lnet_udsp_get_construct_info(struct lnet_ioctl_construct_udsp_info *info);
+
+#endif /* UDSP_H */
index e6419f0..4f8bf3c 100644 (file)
@@ -149,7 +149,12 @@ struct libcfs_ioctl_data {
 #define IOC_LIBCFS_SET_HEALHV             _IOWR(IOC_LIBCFS_TYPE, 102, IOCTL_CONFIG_SIZE)
 #define IOC_LIBCFS_GET_LOCAL_HSTATS       _IOWR(IOC_LIBCFS_TYPE, 103, IOCTL_CONFIG_SIZE)
 #define IOC_LIBCFS_GET_RECOVERY_QUEUE     _IOWR(IOC_LIBCFS_TYPE, 104, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_MAX_NR                                        104
+#define IOC_LIBCFS_ADD_UDSP               _IOWR(IOC_LIBCFS_TYPE, 105, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_DEL_UDSP               _IOWR(IOC_LIBCFS_TYPE, 106, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_GET_UDSP_SIZE          _IOWR(IOC_LIBCFS_TYPE, 107, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_GET_UDSP               _IOWR(IOC_LIBCFS_TYPE, 108, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_GET_CONST_UDSP_INFO    _IOWR(IOC_LIBCFS_TYPE, 109, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_MAX_NR                                        109
 
 extern int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data);
 
index bba8aa3..c37edc8 100644 (file)
@@ -293,4 +293,88 @@ struct lnet_ioctl_lnet_stats {
        struct lnet_counters st_cntrs;
 };
 
+/* An IP, numeric NID or a Net number is composed of 1 or more of these
+ * descriptor structures.
+ */
+struct lnet_range_expr {
+       __u32 re_lo;
+       __u32 re_hi;
+       __u32 re_stride;
+};
+
+/* le_count identifies the number of lnet_range_expr in the bulk
+ * which follows
+ */
+struct lnet_expressions {
+       __u32 le_count;
+};
+
+/* A net descriptor has the net type, IE: O2IBLND, SOCKLND, etc and an
+ * expression describing a net number range.
+ */
+struct lnet_ioctl_udsp_net_descr {
+       __u32 ud_net_type;
+       struct lnet_expressions ud_net_num_expr;
+};
+
+/* The UDSP descriptor header contains the type of matching criteria, SRC,
+ * DST, RTE, etc and how many lnet_expressions compose the LNet portion of
+ * the LNet NID. For example an IP can be
+ * composed of 4 lnet_expressions, a gni can be composed of 1
+ */
+struct lnet_ioctl_udsp_descr_hdr {
+       /* The literals SRC, DST and RTE are encoded
+        * here.
+        */
+       __u32 ud_descr_type;
+       __u32 ud_descr_count;
+};
+
+/* each matching expression in the UDSP is described with this.
+ * The bulk format is as follows:
+ *     1. 1x struct lnet_ioctl_udsp_net_descr
+ *             -> the net part of the NID
+ *     2. >=0 struct lnet_expressions
+ *             -> the address part of the NID
+ */
+struct lnet_ioctl_udsp_descr {
+       struct lnet_ioctl_udsp_descr_hdr iud_src_hdr;
+       struct lnet_ioctl_udsp_net_descr iud_net;
+};
+
+/* The cumulative UDSP descriptor
+ * The bulk format is as follows:
+ *     1. >=1 struct lnet_ioctl_udsp_descr
+ *
+ * The size indicated in iou_hdr is the total size of the UDSP.
+ *
+ */
+struct lnet_ioctl_udsp {
+       struct libcfs_ioctl_hdr iou_hdr;
+       __s32 iou_idx;
+       __u32 iou_action_type;
+       __u32 iou_bulk_size;
+       union {
+               __u32 priority;
+       } iou_action;
+       void __user *iou_bulk;
+};
+
+/* structure used to request udsp instantiation information on the
+ * specified construct.
+ *   cud_nid: the NID of the local or remote NI to pull info on.
+ *   cud_nid_priority: NID prio of the requested NID.
+ *   cud_net_priority: net prio of network of the requested NID.
+ *   cud_pref_nid: array of preferred NIDs if it exists.
+ */
+struct lnet_ioctl_construct_udsp_info {
+       struct libcfs_ioctl_hdr cud_hdr;
+       __u32 cud_peer:1;
+       lnet_nid_t cud_nid;
+       __u32 cud_nid_priority;
+       __u32 cud_net_priority;
+       lnet_nid_t cud_pref_nid[LNET_MAX_SHOW_NUM_NID];
+       lnet_nid_t cud_pref_rtr_nid[LNET_MAX_SHOW_NUM_NID];
+};
+
 #endif /* _LNET_DLC_H_ */
index 2a478ef..cc9ec76 100644 (file)
@@ -701,6 +701,20 @@ enum lnet_ack_req {
        /** Request that no acknowledgment should be generated. */
        LNET_NOACK_REQ
 };
+
+/**
+ * UDSP action types. There are two available actions:
+ *     1. PRIORITY - set priority of matching LNet constructs
+ *     2. PREFERRED LIST - set preferred list of matching LNet constructs
+ */
+enum lnet_udsp_action_type {
+       EN_LNET_UDSP_ACTION_NONE = 0,
+       /** assign a priority to matching constructs */
+       EN_LNET_UDSP_ACTION_PRIORITY = 1,
+       /** assign a preferred list of NIDs to matching constructs */
+       EN_LNET_UDSP_ACTION_PREFERRED_LIST = 2,
+};
+
 /** @} lnet_data */
 
 /** @} lnet */
index a32f379..1e4da91 100644 (file)
@@ -87,17 +87,26 @@ static inline char *libcfs_nid2str(lnet_nid_t nid)
 __u32 libcfs_str2net(const char *str);
 lnet_nid_t libcfs_str2nid(const char *str);
 int libcfs_str2anynid(lnet_nid_t *nid, const char *str);
+int libcfs_num_parse(char *str, int len, struct list_head *list);
 char *libcfs_id2str(struct lnet_process_id id);
 void cfs_free_nidlist(struct list_head *list);
 int cfs_parse_nidlist(char *str, int len, struct list_head *list);
+int cfs_parse_nid_parts(char *str, struct list_head *addr,
+                       struct list_head *net_num, __u32 *net_type);
 int cfs_print_nidlist(char *buffer, int count, struct list_head *list);
 int cfs_match_nid(lnet_nid_t nid, struct list_head *list);
 int cfs_expand_nidlist(struct list_head *nidlist, lnet_nid_t *lnet_nidlist,
                       int max_nids);
+int cfs_match_nid_net(lnet_nid_t nid, __u32 net, struct list_head *net_num_list,
+                     struct list_head *addr);
+int cfs_match_net(__u32 net_id, __u32 net_type,
+                 struct list_head *net_num_list);
+
 int cfs_ip_addr_parse(char *str, int len, struct list_head *list);
 int cfs_ip_addr_match(__u32 addr, struct list_head *list);
 int cfs_nidrange_find_min_max(struct list_head *nidlist, char *min_nid,
                               char *max_nid, __kernel_size_t nidstr_length);
+void cfs_expr_list_free_list(struct list_head *list);
 
 struct netstrfns {
        __u32   nf_type;
index b55ee89..3d0aac9 100644 (file)
@@ -3,7 +3,7 @@ MODULES := lnet
 lnet-objs := api-ni.o config.o nidstrings.o
 lnet-objs += lib-me.o lib-msg.o lib-md.o lib-ptl.o
 lnet-objs += lib-socket.o lib-move.o module.o lo.o
-lnet-objs += router.o router_proc.o acceptor.o peer.o net_fault.o
+lnet-objs += router.o router_proc.o acceptor.o peer.o net_fault.o udsp.o
 
 default: all
 
index 2297772..a75e2f1 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/sched/signal.h>
 #endif
 
+#include <lnet/udsp.h>
 #include <lnet/lib-lnet.h>
 
 #define D_LNI D_CONSOLE
@@ -604,6 +605,7 @@ lnet_init_locks(void)
 struct kmem_cache *lnet_mes_cachep;       /* MEs kmem_cache */
 struct kmem_cache *lnet_small_mds_cachep;  /* <= LNET_SMALL_MD_SIZE bytes
                                            *  MDs kmem_cache */
+struct kmem_cache *lnet_udsp_cachep;      /* udsp cache */
 struct kmem_cache *lnet_rspt_cachep;      /* response tracker cache */
 struct kmem_cache *lnet_msg_cachep;
 
@@ -624,6 +626,12 @@ lnet_slab_setup(void)
        if (!lnet_small_mds_cachep)
                return -ENOMEM;
 
+       lnet_udsp_cachep = kmem_cache_create("lnet_udsp",
+                                            sizeof(struct lnet_udsp),
+                                            0, 0, NULL);
+       if (!lnet_udsp_cachep)
+               return -ENOMEM;
+
        lnet_rspt_cachep = kmem_cache_create("lnet_rspt", sizeof(struct lnet_rsp_tracker),
                                            0, 0, NULL);
        if (!lnet_rspt_cachep)
@@ -645,12 +653,16 @@ lnet_slab_cleanup(void)
                lnet_msg_cachep = NULL;
        }
 
-
        if (lnet_rspt_cachep) {
                kmem_cache_destroy(lnet_rspt_cachep);
                lnet_rspt_cachep = NULL;
        }
 
+       if (lnet_udsp_cachep) {
+               kmem_cache_destroy(lnet_udsp_cachep);
+               lnet_udsp_cachep = NULL;
+       }
+
        if (lnet_small_mds_cachep) {
                kmem_cache_destroy(lnet_small_mds_cachep);
                lnet_small_mds_cachep = NULL;
@@ -1228,6 +1240,7 @@ lnet_prepare(lnet_pid_t requested_pid)
        INIT_LIST_HEAD(&the_lnet.ln_dc_expired);
        INIT_LIST_HEAD(&the_lnet.ln_mt_localNIRecovq);
        INIT_LIST_HEAD(&the_lnet.ln_mt_peerNIRecovq);
+       INIT_LIST_HEAD(&the_lnet.ln_udsp_list);
        init_waitqueue_head(&the_lnet.ln_dc_waitq);
        the_lnet.ln_mt_handler = NULL;
        init_completion(&the_lnet.ln_started);
@@ -1334,6 +1347,7 @@ lnet_unprepare (void)
                the_lnet.ln_counters = NULL;
        }
        lnet_destroy_remote_nets_table();
+       lnet_udsp_destroy(true);
        lnet_slab_cleanup();
 
        return 0;
@@ -1386,6 +1400,81 @@ lnet_get_net_locked(__u32 net_id)
        return NULL;
 }
 
+void
+lnet_net_clr_pref_rtrs(struct lnet_net *net)
+{
+       struct list_head zombies;
+       struct lnet_nid_list *ne;
+       struct lnet_nid_list *tmp;
+
+       INIT_LIST_HEAD(&zombies);
+
+       lnet_net_lock(LNET_LOCK_EX);
+       list_splice_init(&net->net_rtr_pref_nids, &zombies);
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       list_for_each_entry_safe(ne, tmp, &zombies, nl_list) {
+               list_del_init(&ne->nl_list);
+               LIBCFS_FREE(ne, sizeof(*ne));
+       }
+}
+
+int
+lnet_net_add_pref_rtr(struct lnet_net *net,
+                     lnet_nid_t gw_nid)
+__must_hold(&the_lnet.ln_api_mutex)
+{
+       struct lnet_nid_list *ne;
+
+       /* This function is called with the api_mutex held. While the
+        * api_mutex is held the list cannot be modified, because it is
+        * only modified as a result of applying a UDSP, and that happens
+        * under the api_mutex.
+        */
+       list_for_each_entry(ne, &net->net_rtr_pref_nids, nl_list) {
+               if (ne->nl_nid == gw_nid)
+                       return -EEXIST;
+       }
+
+       LIBCFS_ALLOC(ne, sizeof(*ne));
+       if (!ne)
+               return -ENOMEM;
+
+       ne->nl_nid = gw_nid;
+
+       /* Take the net lock exclusively so that this addition cannot
+        * race with the checks done in the selection algorithm
+        */
+       lnet_net_lock(LNET_LOCK_EX);
+       list_add(&ne->nl_list, &net->net_rtr_pref_nids);
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       return 0;
+}
+
+bool
+lnet_net_is_pref_rtr_locked(struct lnet_net *net, lnet_nid_t rtr_nid)
+{
+       struct lnet_nid_list *ne;
+
+       CDEBUG(D_NET, "%s: rtr pref emtpy: %d\n",
+              libcfs_net2str(net->net_id),
+              list_empty(&net->net_rtr_pref_nids));
+
+       if (list_empty(&net->net_rtr_pref_nids))
+               return false;
+
+       list_for_each_entry(ne, &net->net_rtr_pref_nids, nl_list) {
+               CDEBUG(D_NET, "Comparing pref %s with gw %s\n",
+                      libcfs_nid2str(ne->nl_nid),
+                      libcfs_nid2str(rtr_nid));
+               if (rtr_nid == ne->nl_nid)
+                       return true;
+       }
+
+       return false;
+}
+
 unsigned int
 lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number)
 {
@@ -3005,6 +3094,21 @@ lnet_get_ni_idx_locked(int idx)
        return NULL;
 }
 
+int lnet_get_net_healthv_locked(struct lnet_net *net)
+{
+       struct lnet_ni *ni;
+       int best_healthv = 0;
+       int healthv;
+
+       list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+               healthv = atomic_read(&ni->ni_healthv);
+               if (healthv > best_healthv)
+                       best_healthv = healthv;
+       }
+
+       return best_healthv;
+}
+
 struct lnet_ni *
 lnet_get_next_ni_locked(struct lnet_net *mynet, struct lnet_ni *prev)
 {
@@ -3138,12 +3242,13 @@ int lnet_get_ni_stats(struct lnet_ioctl_element_msg_stats *msg_stats)
 static int lnet_add_net_common(struct lnet_net *net,
                               struct lnet_ioctl_config_lnd_tunables *tun)
 {
-       __u32                   net_id;
+       struct lnet_handle_md ping_mdh;
        struct lnet_ping_buffer *pbuf;
-       struct lnet_handle_md   ping_mdh;
-       int                     rc;
        struct lnet_remotenet *rnet;
-       int                     net_ni_count;
+       struct lnet_ni *ni;
+       int net_ni_count;
+       __u32 net_id;
+       int rc;
 
        lnet_net_lock(LNET_LOCK_EX);
        rnet = lnet_find_rnet_locked(net->net_id);
@@ -3193,10 +3298,25 @@ static int lnet_add_net_common(struct lnet_net *net,
 
        lnet_net_lock(LNET_LOCK_EX);
        net = lnet_get_net_locked(net_id);
-       lnet_net_unlock(LNET_LOCK_EX);
-
        LASSERT(net);
 
+       /* apply the UDSPs */
+       rc = lnet_udsp_apply_policies_on_net(net);
+       if (rc)
+               CERROR("Failed to apply UDSPs on local net %s\n",
+                      libcfs_net2str(net->net_id));
+
+       /* At this point we lost track of which NI was just added, so we
+        * just re-apply the policies on all of the NIs on this net
+        */
+       list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+               rc = lnet_udsp_apply_policies_on_ni(ni);
+               if (rc)
+                       CERROR("Failed to apply UDSPs on ni %s\n",
+                              libcfs_nid2str(ni->ni_nid));
+       }
+       lnet_net_unlock(LNET_LOCK_EX);
+
        /*
         * Start the acceptor thread if this is the first network
         * being added that requires the thread.
@@ -4071,6 +4191,106 @@ LNetCtl(unsigned int cmd, void *arg)
                return 0;
        }
 
+       case IOC_LIBCFS_ADD_UDSP: {
+               struct lnet_ioctl_udsp *ioc_udsp = arg;
+               __u32 bulk_size = ioc_udsp->iou_hdr.ioc_len;
+
+               mutex_lock(&the_lnet.ln_api_mutex);
+               rc = lnet_udsp_demarshal_add(arg, bulk_size);
+               if (!rc) {
+                       rc = lnet_udsp_apply_policies(NULL, false);
+                       CDEBUG(D_NET, "policy application returned %d\n", rc);
+                       rc = 0;
+               }
+               mutex_unlock(&the_lnet.ln_api_mutex);
+
+               return rc;
+       }
+
+       case IOC_LIBCFS_DEL_UDSP: {
+               struct lnet_ioctl_udsp *ioc_udsp = arg;
+               int idx = ioc_udsp->iou_idx;
+
+               if (ioc_udsp->iou_hdr.ioc_len < sizeof(*ioc_udsp))
+                       return -EINVAL;
+
+               mutex_lock(&the_lnet.ln_api_mutex);
+               rc = lnet_udsp_del_policy(idx);
+               if (!rc) {
+                       rc = lnet_udsp_apply_policies(NULL, false);
+                       CDEBUG(D_NET, "policy re-application returned %d\n",
+                              rc);
+                       rc = 0;
+               }
+               mutex_unlock(&the_lnet.ln_api_mutex);
+
+               return rc;
+       }
+
+       case IOC_LIBCFS_GET_UDSP_SIZE: {
+               struct lnet_ioctl_udsp *ioc_udsp = arg;
+               struct lnet_udsp *udsp;
+
+               if (ioc_udsp->iou_hdr.ioc_len < sizeof(*ioc_udsp))
+                       return -EINVAL;
+
+               rc = 0;
+
+               mutex_lock(&the_lnet.ln_api_mutex);
+               udsp = lnet_udsp_get_policy(ioc_udsp->iou_idx);
+               if (!udsp) {
+                       rc = -ENOENT;
+               } else {
+                       /* On input, iou_idx holds the index of the UDSP
+                        * whose size is requested. On output, iou_idx
+                        * holds the size of the UDSP found at that
+                        * index.
+                        */
+                       ioc_udsp->iou_idx = lnet_get_udsp_size(udsp);
+                       if (ioc_udsp->iou_idx < 0)
+                               rc = -EINVAL;
+               }
+               mutex_unlock(&the_lnet.ln_api_mutex);
+
+               return rc;
+       }
+
+       case IOC_LIBCFS_GET_UDSP: {
+               struct lnet_ioctl_udsp *ioc_udsp = arg;
+               struct lnet_udsp *udsp;
+
+               if (ioc_udsp->iou_hdr.ioc_len < sizeof(*ioc_udsp))
+                       return -EINVAL;
+
+               rc = 0;
+
+               mutex_lock(&the_lnet.ln_api_mutex);
+               udsp = lnet_udsp_get_policy(ioc_udsp->iou_idx);
+               if (!udsp)
+                       rc = -ENOENT;
+               else
+                       rc = lnet_udsp_marshal(udsp, ioc_udsp);
+               mutex_unlock(&the_lnet.ln_api_mutex);
+
+               return rc;
+       }
+
+       case IOC_LIBCFS_GET_CONST_UDSP_INFO: {
+               struct lnet_ioctl_construct_udsp_info *info = arg;
+
+               if (info->cud_hdr.ioc_len < sizeof(*info))
+                       return -EINVAL;
+
+               CDEBUG(D_NET, "GET_UDSP_INFO for %s\n",
+                      libcfs_nid2str(info->cud_nid));
+
+               mutex_lock(&the_lnet.ln_api_mutex);
+               lnet_udsp_get_construct_info(info);
+               mutex_unlock(&the_lnet.ln_api_mutex);
+
+               return 0;
+       }
+
        default:
                ni = lnet_net2ni_addref(data->ioc_net);
                if (ni == NULL)
index 746253b..610100a 100644 (file)
@@ -373,11 +373,14 @@ lnet_net_alloc(__u32 net_id, struct list_head *net_list)
        INIT_LIST_HEAD(&net->net_ni_list);
        INIT_LIST_HEAD(&net->net_ni_added);
        INIT_LIST_HEAD(&net->net_ni_zombie);
+       INIT_LIST_HEAD(&net->net_rtr_pref_nids);
        spin_lock_init(&net->net_lock);
 
        net->net_id = net_id;
        net->net_last_alive = ktime_get_real_seconds();
 
+       net->net_sel_priority = LNET_MAX_SELECTION_PRIORITY;
+
        /* initialize global parameters to undefined */
        net->net_tunables.lct_peer_timeout = -1;
        net->net_tunables.lct_max_tx_credits = -1;
@@ -481,6 +484,7 @@ lnet_ni_alloc_common(struct lnet_net *net, char *iface)
                ni->ni_net_ns = get_net(&init_net);
 
        ni->ni_state = LNET_NI_STATE_INIT;
+       ni->ni_sel_priority = LNET_MAX_SELECTION_PRIORITY;
        list_add_tail(&ni->ni_netlist, &net->net_ni_added);
 
        /*
index c1e6483..5757b82 100644 (file)
@@ -1283,24 +1283,6 @@ routing_off:
        }
 }
 
-static int
-lnet_compare_gw_lpnis(struct lnet_peer_ni *p1, struct lnet_peer_ni *p2)
-{
-       if (p1->lpni_txqnob < p2->lpni_txqnob)
-               return 1;
-
-       if (p1->lpni_txqnob > p2->lpni_txqnob)
-               return -1;
-
-       if (p1->lpni_txcredits > p2->lpni_txcredits)
-               return 1;
-
-       if (p1->lpni_txcredits < p2->lpni_txcredits)
-               return -1;
-
-       return 0;
-}
-
 static struct lnet_peer_ni *
 lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid,
                    struct lnet_peer *peer,
@@ -1320,9 +1302,11 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid,
                INT_MIN;
        int best_lpni_healthv = (best_lpni) ?
                atomic_read(&best_lpni->lpni_healthv) : 0;
-       bool preferred = false;
-       bool ni_is_pref;
+       bool best_lpni_is_preferred = false;
+       bool lpni_is_preferred;
        int lpni_healthv;
+       __u32 lpni_sel_prio;
+       __u32 best_sel_prio = LNET_MAX_SELECTION_PRIORITY;
 
        while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) {
                /*
@@ -1330,56 +1314,76 @@ lnet_select_peer_ni(struct lnet_ni *best_ni, lnet_nid_t dst_nid,
                 * preferred, then let's use it
                 */
                if (best_ni) {
-                       ni_is_pref = lnet_peer_is_pref_nid_locked(lpni,
+                       lpni_is_preferred = lnet_peer_is_pref_nid_locked(lpni,
                                                                best_ni->ni_nid);
-                       CDEBUG(D_NET, "%s ni_is_pref = %d\n",
-                              libcfs_nid2str(best_ni->ni_nid), ni_is_pref);
+                       CDEBUG(D_NET, "%s lpni_is_preferred = %d\n",
+                              libcfs_nid2str(best_ni->ni_nid),
+                              lpni_is_preferred);
                } else {
-                       ni_is_pref = false;
+                       lpni_is_preferred = false;
                }
 
                lpni_healthv = atomic_read(&lpni->lpni_healthv);
+               lpni_sel_prio = lpni->lpni_sel_priority;
 
                if (best_lpni)
-                       CDEBUG(D_NET, "%s c:[%d, %d], s:[%d, %d]\n",
+                       CDEBUG(D_NET, "n:[%s, %s] h:[%d, %d] p:[%d, %d] c:[%d, %d] s:[%d, %d]\n",
                                libcfs_nid2str(lpni->lpni_nid),
+                               libcfs_nid2str(best_lpni->lpni_nid),
+                               lpni_healthv, best_lpni_healthv,
+                               lpni_sel_prio, best_sel_prio,
                                lpni->lpni_txcredits, best_lpni_credits,
                                lpni->lpni_seq, best_lpni->lpni_seq);
+               else
+                       goto select_lpni;
 
                /* pick the healthiest peer ni */
-               if (lpni_healthv < best_lpni_healthv) {
+               if (lpni_healthv < best_lpni_healthv)
                        continue;
-               } else if (lpni_healthv > best_lpni_healthv) {
-                       best_lpni_healthv = lpni_healthv;
+               else if (lpni_healthv > best_lpni_healthv) {
+                       if (best_lpni_is_preferred)
+                               best_lpni_is_preferred = false;
+                       goto select_lpni;
+               }
+
+               if (lpni_sel_prio > best_sel_prio)
+                       continue;
+               else if (lpni_sel_prio < best_sel_prio) {
+                       if (best_lpni_is_preferred)
+                               best_lpni_is_preferred = false;
+                       goto select_lpni;
+               }
+
                /* if this is a preferred peer use it */
-               } else if (!preferred && ni_is_pref) {
-                       preferred = true;
-               } else if (preferred && !ni_is_pref) {
-                       /*
-                        * this is not the preferred peer so let's ignore
+               if (!best_lpni_is_preferred && lpni_is_preferred) {
+                       best_lpni_is_preferred = true;
+                       goto select_lpni;
+               } else if (best_lpni_is_preferred && !lpni_is_preferred) {
+                       /* this is not the preferred peer so let's ignore
                         * it.
                         */
                        continue;
-               } else if (lpni->lpni_txcredits < best_lpni_credits) {
-                       /*
-                        * We already have a peer that has more credits
+               }
+
+               if (lpni->lpni_txcredits < best_lpni_credits)
+                       /* We already have a peer that has more credits
                         * available than this one. No need to consider
                         * this peer further.
                         */
                        continue;
-               } else if (lpni->lpni_txcredits == best_lpni_credits) {
-                       /*
-                        * The best peer found so far and the current peer
-                        * have the same number of available credits let's
-                        * make sure to select between them using Round
-                        * Robin
-                        */
-                       if (best_lpni) {
-                               if (best_lpni->lpni_seq <= lpni->lpni_seq)
-                                       continue;
-                       }
-               }
+               else if (lpni->lpni_txcredits > best_lpni_credits)
+                       goto select_lpni;
 
+               /* The best peer found so far and the current peer
+                * have the same number of available credits, so
+                * select between them using Round Robin
+                */
+               if (best_lpni && (best_lpni->lpni_seq <= lpni->lpni_seq))
+                       continue;
+select_lpni:
+               best_lpni_is_preferred = lpni_is_preferred;
+               best_lpni_healthv = lpni_healthv;
+               best_sel_prio = lpni_sel_prio;
                best_lpni = lpni;
                best_lpni_credits = lpni->lpni_txcredits;
        }
@@ -1437,6 +1441,24 @@ lnet_find_best_lpni(struct lnet_ni *lni, lnet_nid_t dst_nid,
        return NULL;
 }
 
+static int
+lnet_compare_gw_lpnis(struct lnet_peer_ni *lpni1, struct lnet_peer_ni *lpni2)
+{
+       if (lpni1->lpni_txqnob < lpni2->lpni_txqnob)
+               return 1;
+
+       if (lpni1->lpni_txqnob > lpni2->lpni_txqnob)
+               return -1;
+
+       if (lpni1->lpni_txcredits > lpni2->lpni_txcredits)
+               return 1;
+
+       if (lpni1->lpni_txcredits < lpni2->lpni_txcredits)
+               return -1;
+
+       return 0;
+}
+
 /* Compare route priorities and hop counts */
 static int
 lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2)
@@ -1461,6 +1483,7 @@ lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2)
 
 static struct lnet_route *
 lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net,
+                      struct lnet_peer_ni *remote_lpni,
                       struct lnet_route **prev_route,
                       struct lnet_peer_ni **gwni)
 {
@@ -1469,6 +1492,8 @@ lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net,
        struct lnet_route *last_route;
        struct lnet_route *route;
        int rc;
+       bool best_rte_is_preferred = false;
+       lnet_nid_t gw_pnid;
 
        CDEBUG(D_NET, "Looking up a route to %s, from %s\n",
               libcfs_net2str(rnet->lrn_net), libcfs_net2str(src_net));
@@ -1477,43 +1502,75 @@ lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net,
        list_for_each_entry(route, &rnet->lrn_routes, lr_list) {
                if (!lnet_is_route_alive(route))
                        continue;
+               gw_pnid = route->lr_gateway->lp_primary_nid;
 
-               /*
-                * Restrict the selection of the router NI on the src_net
-                * provided. If the src_net is LNET_NID_ANY, then select
-                * the best interface available.
+               /* no protection on below fields, but it's harmless */
+               if (last_route && (last_route->lr_seq - route->lr_seq < 0))
+                       last_route = route;
+
+               /* if the best route found is in the preferred list then
+                * tag it as preferred and use it later on. But if we
+                * didn't find any routes which are on the preferred list
+                * then just use the best route possible.
                 */
-               if (!best_route) {
+               rc = lnet_peer_is_pref_rtr_locked(remote_lpni, gw_pnid);
+
+               if (!best_route || (rc && !best_rte_is_preferred)) {
+                       /* Restrict the selection of the router NI on the
+                        * src_net provided. If the src_net is LNET_NID_ANY,
+                        * then select the best interface available.
+                        */
                        lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY,
                                                   route->lr_gateway,
                                                   src_net);
-                       if (lpni) {
-                               best_route = last_route = route;
-                               best_gw_ni = lpni;
-                       } else {
-                               CDEBUG(D_NET, "Gateway %s does not have a peer NI on net %s\n",
-                                      libcfs_nid2str(route->lr_gateway->lp_primary_nid),
+                       if (!lpni) {
+                               CDEBUG(D_NET,
+                                      "Gateway %s does not have a peer NI on net %s\n",
+                                      libcfs_nid2str(gw_pnid),
                                       libcfs_net2str(src_net));
+                               continue;
                        }
+               }
+
+               if (rc && !best_rte_is_preferred) {
+                       /* This is the first preferred route we found,
+                        * so it beats any route found previously
+                        */
+                       best_route = route;
+                       if (!last_route)
+                               last_route = route;
+                       best_gw_ni = lpni;
+                       best_rte_is_preferred = true;
+                       CDEBUG(D_NET, "preferred gw = %s\n",
+                              libcfs_nid2str(gw_pnid));
+                       continue;
+               } else if ((!rc) && best_rte_is_preferred)
+                       /* The best route we found so far is in the preferred
+                        * list, so it beats any non-preferred route
+                        */
+                       continue;
 
+               if (!best_route) {
+                       best_route = last_route = route;
+                       best_gw_ni = lpni;
                        continue;
                }
 
-               /* no protection on below fields, but it's harmless */
-               if (last_route->lr_seq - route->lr_seq < 0)
-                       last_route = route;
-
                rc = lnet_compare_routes(route, best_route);
                if (rc == -1)
                        continue;
 
+               /* Restrict the selection of the router NI on the
+                * src_net provided. If the src_net is LNET_NID_ANY,
+                * then select the best interface available.
+                */
                lpni = lnet_find_best_lpni(NULL, LNET_NID_ANY,
                                           route->lr_gateway,
                                           src_net);
-               /* restrict the lpni on the src_net if specified */
                if (!lpni) {
-                       CDEBUG(D_NET, "Gateway %s does not have a peer NI on net %s\n",
-                              libcfs_nid2str(route->lr_gateway->lp_primary_nid),
+                       CDEBUG(D_NET,
+                              "Gateway %s does not have a peer NI on net %s\n",
+                              libcfs_nid2str(gw_pnid),
                               libcfs_net2str(src_net));
                        continue;
                }
@@ -1550,6 +1607,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
        unsigned int shortest_distance;
        int best_credits;
        int best_healthv;
+       __u32 best_sel_prio;
 
        /*
         * If there is no peer_ni that we can send to on this network,
@@ -1559,6 +1617,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                return best_ni;
 
        if (best_ni == NULL) {
+               best_sel_prio = LNET_MAX_SELECTION_PRIORITY;
                shortest_distance = UINT_MAX;
                best_credits = INT_MIN;
                best_healthv = 0;
@@ -1567,6 +1626,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                                                     best_ni->ni_dev_cpt);
                best_credits = atomic_read(&best_ni->ni_tx_credits);
                best_healthv = atomic_read(&best_ni->ni_healthv);
+               best_sel_prio = best_ni->ni_sel_priority;
        }
 
        while ((ni = lnet_get_next_ni_locked(local_net, ni))) {
@@ -1574,10 +1634,12 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                int ni_credits;
                int ni_healthv;
                int ni_fatal;
+               __u32 ni_sel_prio;
 
                ni_credits = atomic_read(&ni->ni_tx_credits);
                ni_healthv = atomic_read(&ni->ni_healthv);
                ni_fatal = atomic_read(&ni->ni_fatal_error_on);
+               ni_sel_prio = ni->ni_sel_priority;
 
                /*
                 * calculate the distance from the CPT on which
@@ -1588,12 +1650,6 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                                            md_cpt,
                                            ni->ni_dev_cpt);
 
-               CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d] with best_ni %s [c:%d, d:%d, s:%d]\n",
-                      libcfs_nid2str(ni->ni_nid), ni_credits, distance,
-                      ni->ni_seq, (best_ni) ? libcfs_nid2str(best_ni->ni_nid)
-                       : "not seleced", best_credits, shortest_distance,
-                       (best_ni) ? best_ni->ni_seq : 0);
-
                /*
                 * All distances smaller than the NUMA range
                 * are treated equally.
@@ -1605,31 +1661,47 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                 * Select on health, selection priority, shorter
                 * distance, available credits, then round-robin.
                 */
-               if (ni_fatal) {
+               if (ni_fatal)
                        continue;
-               } else if (ni_healthv < best_healthv) {
+
+               if (best_ni)
+                       CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d, p:%u] with best_ni %s [c:%d, d:%d, s:%d, p:%u]\n",
+                              libcfs_nid2str(ni->ni_nid), ni_credits, distance,
+                              ni->ni_seq, ni_sel_prio,
+                              (best_ni) ? libcfs_nid2str(best_ni->ni_nid)
+                              : "not selected", best_credits, shortest_distance,
+                              (best_ni) ? best_ni->ni_seq : 0,
+                              best_sel_prio);
+               else
+                       goto select_ni;
+
+               if (ni_healthv < best_healthv)
                        continue;
-               } else if (ni_healthv > best_healthv) {
-                       best_healthv = ni_healthv;
-                       /*
-                        * If we're going to prefer this ni because it's
-                        * the healthiest, then we should set the
-                        * shortest_distance in the algorithm in case
-                        * there are multiple NIs with the same health but
-                        * different distances.
-                        */
-                       if (distance < shortest_distance)
-                               shortest_distance = distance;
-               } else if (distance > shortest_distance) {
+               else if (ni_healthv > best_healthv)
+                       goto select_ni;
+
+               if (ni_sel_prio > best_sel_prio)
                        continue;
-               } else if (distance < shortest_distance) {
-                       shortest_distance = distance;
-               } else if (ni_credits < best_credits) {
+               else if (ni_sel_prio < best_sel_prio)
+                       goto select_ni;
+
+               if (distance > shortest_distance)
                        continue;
-               } else if (ni_credits == best_credits) {
-                       if (best_ni && best_ni->ni_seq <= ni->ni_seq)
-                               continue;
-               }
+               else if (distance < shortest_distance)
+                       goto select_ni;
+
+               if (ni_credits < best_credits)
+                       continue;
+               else if (ni_credits > best_credits)
+                       goto select_ni;
+
+               if (best_ni && best_ni->ni_seq <= ni->ni_seq)
+                       continue;
+
+select_ni:
+               best_sel_prio = ni_sel_prio;
+               shortest_distance = distance;
+               best_healthv = ni_healthv;
                best_ni = ni;
                best_credits = ni_credits;
        }
@@ -1715,11 +1787,24 @@ lnet_handle_send(struct lnet_send_data *sd)
        __u32 routing = send_case & REMOTE_DST;
         struct lnet_rsp_tracker *rspt;
 
-       /*
-        * Increment sequence number of the selected peer so that we
-        * pick the next one in Round Robin.
+       /* Increment sequence number of the selected peer, peer net,
+        * local ni and local net so that we pick the next ones
+        * in Round Robin.
         */
        best_lpni->lpni_seq++;
+       best_lpni->lpni_peer_net->lpn_seq++;
+       best_ni->ni_seq++;
+       best_ni->ni_net->net_seq++;
+
+       CDEBUG(D_NET, "%s NI seq info: [%d:%d:%d:%u] %s LPNI seq info [%d:%d:%d:%u]\n",
+              libcfs_nid2str(best_ni->ni_nid),
+              best_ni->ni_seq, best_ni->ni_net->net_seq,
+              atomic_read(&best_ni->ni_tx_credits),
+              best_ni->ni_sel_priority,
+              libcfs_nid2str(best_lpni->lpni_nid),
+              best_lpni->lpni_seq, best_lpni->lpni_peer_net->lpn_seq,
+              best_lpni->lpni_txcredits,
+              best_lpni->lpni_sel_priority);
 
        /*
         * grab a reference on the peer_ni so it sticks around even if
@@ -1913,8 +1998,7 @@ struct lnet_ni *
 lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni,
                              struct lnet_peer *peer,
                              struct lnet_peer_net *peer_net,
-                             int cpt,
-                             bool incr_seq)
+                             int cpt)
 {
        struct lnet_net *local_net;
        struct lnet_ni *best_ni;
@@ -1934,9 +2018,6 @@ lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni,
        best_ni = lnet_get_best_ni(local_net, cur_best_ni,
                                   peer, peer_net, cpt);
 
-       if (incr_seq && best_ni)
-               best_ni->ni_seq++;
-
        return best_ni;
 }
 
@@ -2009,6 +2090,8 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd,
        lnet_nid_t src_nid = (sd->sd_src_nid != LNET_NID_ANY) ? sd->sd_src_nid :
                (sd->sd_best_ni != NULL) ? sd->sd_best_ni->ni_nid :
                LNET_NID_ANY;
+       int best_lpn_healthv = 0;
+       __u32 best_lpn_sel_prio = LNET_MAX_SELECTION_PRIORITY;
 
        CDEBUG(D_NET, "using src nid %s for route restriction\n",
               libcfs_nid2str(src_nid));
@@ -2064,9 +2147,22 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd,
                                        best_rnet = rnet;
                                }
 
-                               if (best_lpn->lpn_seq <= lpn->lpn_seq)
+                               /* select the preferred peer net */
+                               if (best_lpn_healthv > lpn->lpn_healthv)
                                        continue;
+                               else if (best_lpn_healthv < lpn->lpn_healthv)
+                                       goto use_lpn;
 
+                               if (best_lpn_sel_prio < lpn->lpn_sel_priority)
+                                       continue;
+                               else if (best_lpn_sel_prio > lpn->lpn_sel_priority)
+                                       goto use_lpn;
+
+                               if (best_lpn->lpn_seq <= lpn->lpn_seq)
+                                       continue;
+use_lpn:
+                               best_lpn_healthv = lpn->lpn_healthv;
+                               best_lpn_sel_prio = lpn->lpn_sel_priority;
                                best_lpn = lpn;
                                best_rnet = rnet;
                        }
@@ -2109,6 +2205,7 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd,
                 */
                best_route = lnet_find_route_locked(best_rnet,
                                                    LNET_NIDNET(src_nid),
+                                                   sd->sd_best_lpni,
                                                    &last_route, &gwni);
 
                if (!best_route) {
@@ -2144,8 +2241,7 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd,
                sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, gw,
                                        lnet_peer_get_net_locked(gw,
                                                                 local_lnet),
-                                       sd->sd_md_cpt,
-                                       true);
+                                       sd->sd_md_cpt);
 
        if (!sd->sd_best_ni) {
                CERROR("Internal Error. Expected local ni on %s but non found :%s\n",
@@ -2230,9 +2326,19 @@ struct lnet_ni *
 lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt,
                               bool discovery)
 {
-       struct lnet_peer_net *peer_net = NULL;
+       struct lnet_peer_net *lpn = NULL;
+       struct lnet_peer_net *best_lpn = NULL;
+       struct lnet_net *net = NULL;
+       struct lnet_net *best_net = NULL;
        struct lnet_ni *best_ni = NULL;
-       int lpn_healthv = 0;
+       int best_lpn_healthv = 0;
+       int best_net_healthv = 0;
+       int net_healthv;
+       __u32 best_lpn_sel_prio = LNET_MAX_SELECTION_PRIORITY;
+       __u32 lpn_sel_prio;
+       __u32 best_net_sel_prio = LNET_MAX_SELECTION_PRIORITY;
+       __u32 net_sel_prio;
+       bool exit = false;
 
        /*
         * The peer can have multiple interfaces, some of them can be on
@@ -2242,35 +2348,82 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt,
         */
 
        /* go through all the peer nets and find the best_ni */
-       list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) {
+       list_for_each_entry(lpn, &peer->lp_peer_nets, lpn_peer_nets) {
                /*
                 * The peer's list of nets can contain non-local nets. We
                 * want to only examine the local ones.
                 */
-               if (!lnet_get_net_locked(peer_net->lpn_net_id))
+               net = lnet_get_net_locked(lpn->lpn_net_id);
+               if (!net)
                        continue;
 
-               /* always select the lpn with the best health */
-               if (lpn_healthv <= peer_net->lpn_healthv)
-                       lpn_healthv = peer_net->lpn_healthv;
-               else
-                       continue;
-
-               best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, peer_net,
-                                                       md_cpt, false);
+               lpn_sel_prio = lpn->lpn_sel_priority;
+               net_healthv = lnet_get_net_healthv_locked(net);
+               net_sel_prio = net->net_sel_priority;
 
                /*
                 * if this is a discovery message and lp_disc_net_id is
                 * specified then use that net to send the discovery on.
                 */
-               if (peer->lp_disc_net_id == peer_net->lpn_net_id &&
-                   discovery)
+               if (peer->lp_disc_net_id == lpn->lpn_net_id &&
+                   discovery) {
+                       exit = true;
+                       goto select_lpn;
+               }
+
+               if (!best_lpn)
+                       goto select_lpn;
+
+               /* always select the lpn with the best health */
+               if (best_lpn_healthv > lpn->lpn_healthv)
+                       continue;
+               else if (best_lpn_healthv < lpn->lpn_healthv)
+                       goto select_lpn;
+
+               /* select the preferred peer and local nets */
+               if (best_lpn_sel_prio < lpn_sel_prio)
+                       continue;
+               else if (best_lpn_sel_prio > lpn_sel_prio)
+                       goto select_lpn;
+
+               if (best_net_healthv > net_healthv)
+                       continue;
+               else if (best_net_healthv < net_healthv)
+                       goto select_lpn;
+
+               if (best_net_sel_prio < net_sel_prio)
+                       continue;
+               else if (best_net_sel_prio > net_sel_prio)
+                       goto select_lpn;
+
+               if (best_lpn->lpn_seq < lpn->lpn_seq)
+                       continue;
+               else if (best_lpn->lpn_seq > lpn->lpn_seq)
+                       goto select_lpn;
+
+               /* round robin over the local networks */
+               if (best_net->net_seq <= net->net_seq)
+                       continue;
+
+select_lpn:
+               best_net_healthv = net_healthv;
+               best_net_sel_prio = net_sel_prio;
+               best_lpn_healthv = lpn->lpn_healthv;
+               best_lpn_sel_prio = lpn_sel_prio;
+               best_lpn = lpn;
+               best_net = net;
+
+               if (exit)
                        break;
        }
 
-       if (best_ni)
-               /* increment sequence number so we can round robin */
-               best_ni->ni_seq++;
+       if (best_lpn) {
+               /* Select the best NI on the same net as best_lpn chosen
+                * above
+                */
+               best_ni = lnet_find_best_ni_on_spec_net(NULL, peer,
+                                                       best_lpn, md_cpt);
+       }
 
        return best_ni;
 }
@@ -2331,7 +2484,7 @@ lnet_select_preferred_best_ni(struct lnet_send_data *sd)
                best_ni =
                  lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer,
                                                sd->sd_best_lpni->lpni_peer_net,
-                                               sd->sd_md_cpt, true);
+                                               sd->sd_md_cpt);
                /* If there is no best_ni we don't have a route */
                if (!best_ni) {
                        CERROR("no path to %s from net %s\n",
@@ -2387,8 +2540,7 @@ lnet_handle_any_local_nmr_dst(struct lnet_send_data *sd)
                sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL,
                                                               sd->sd_peer,
                                                               sd->sd_best_lpni->lpni_peer_net,
-                                                              sd->sd_md_cpt,
-                                                              true);
+                                                              sd->sd_md_cpt);
                if (!sd->sd_best_ni) {
                        CERROR("Unable to forward message to %s. No local NI available\n",
                               libcfs_nid2str(sd->sd_dst_nid));
@@ -2421,7 +2573,7 @@ lnet_handle_any_mr_dsta(struct lnet_send_data *sd)
                sd->sd_best_ni =
                  lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer,
                                                sd->sd_best_lpni->lpni_peer_net,
-                                               sd->sd_md_cpt, true);
+                                               sd->sd_md_cpt);
 
                if (!sd->sd_best_ni) {
                        /*
index 26edd33..df82695 100644 (file)
@@ -607,7 +607,7 @@ libcfs_num_str2addr(const char *str, int nob, __u32 *addr)
  * \retval 0 if \a str parsed to numeric address
  * \retval errno otherwise
  */
-static int
+int
 libcfs_num_parse(char *str, int len, struct list_head *list)
 {
        struct cfs_expr_list *el;
@@ -710,6 +710,72 @@ static struct netstrfns libcfs_netstrfns[] = {
 static const size_t libcfs_nnetstrfns = ARRAY_SIZE(libcfs_netstrfns);
 
 static struct netstrfns *
+type2net_info(__u32 net_type)
+{
+       /* Linear scan of the libcfs_netstrfns table for the entry whose
+        * nf_type equals net_type. Returns a pointer into the table, or
+        * NULL when no entry has that type.
+        */
+       struct netstrfns *nf = libcfs_netstrfns;
+       struct netstrfns *end = libcfs_netstrfns + libcfs_nnetstrfns;
+
+       while (nf != end) {
+               if (nf->nf_type == net_type)
+                       return nf;
+               nf++;
+       }
+
+       return NULL;
+}
+
+/* Check whether net_id belongs to the network described by net_type and
+ * the list of net number range expressions in net_num_list.
+ *
+ * \retval 1 if the net type is equal and the net number matches
+ * \retval 0 otherwise, including when net_num_list is NULL
+ */
+int
+cfs_match_net(__u32 net_id, __u32 net_type, struct list_head *net_num_list)
+{
+       __u32 num;
+
+       if (!net_num_list || LNET_NETTYP(net_id) != net_type)
+               return 0;
+
+       num = LNET_NETNUM(net_id);
+
+       /* An empty range list matches only net number 0. If there is a
+        * non-zero net number but the list passed in is empty, then
+        * there is no match.
+        */
+       if (list_empty(net_num_list))
+               return num == 0;
+
+       return libcfs_num_match(num, net_num_list) ? 1 : 0;
+}
+
+/* Check whether a NID matches a net type, a list of net number range
+ * expressions and a list of address range expressions.
+ *
+ * The address is matched with the nf_match_addr method of the netstrfns
+ * entry for the NID's own net type; the net is matched by cfs_match_net().
+ *
+ * \retval 1 if both the address and the net match
+ * \retval 0 otherwise, including when \a net_num_list or \a addr is NULL
+ *          or the NID's net type has no netstrfns entry
+ */
+int
+cfs_match_nid_net(lnet_nid_t nid, __u32 net_type,
+                 struct list_head *net_num_list,
+                 struct list_head *addr)
+{
+       __u32 address;
+       struct netstrfns *nf;
+
+       if (!addr || !net_num_list)
+               return 0;
+
+       /* the list pointers were validated above, only nf can be NULL */
+       nf = type2net_info(LNET_NETTYP(LNET_NIDNET(nid)));
+       if (!nf)
+               return 0;
+
+       address = LNET_NIDADDR(nid);
+
+       /* if either the address or net number don't match then no match */
+       if (!nf->nf_match_addr(address, addr) ||
+           !cfs_match_net(LNET_NIDNET(nid), net_type, net_num_list))
+               return 0;
+
+       return 1;
+}
+EXPORT_SYMBOL(cfs_match_nid_net);
+
+static struct netstrfns *
 libcfs_lnd2netstrfns(__u32 lnd)
 {
        int     i;
index a6943e7..83f52af 100644 (file)
@@ -40,6 +40,7 @@
 #endif
 #include <linux/uaccess.h>
 
+#include <lnet/udsp.h>
 #include <lnet/lib-lnet.h>
 #include <uapi/linux/lnet/lnet-dlc.h>
 
@@ -166,8 +167,10 @@ lnet_peer_ni_alloc(lnet_nid_t nid)
        INIT_LIST_HEAD(&lpni->lpni_peer_nis);
        INIT_LIST_HEAD(&lpni->lpni_recovery);
        INIT_LIST_HEAD(&lpni->lpni_on_remote_peer_ni_list);
+       INIT_LIST_HEAD(&lpni->lpni_rtr_pref_nids);
        LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh);
        atomic_set(&lpni->lpni_refcount, 1);
+       lpni->lpni_sel_priority = LNET_MAX_SELECTION_PRIORITY;
 
        spin_lock_init(&lpni->lpni_lock);
 
@@ -217,6 +220,7 @@ lnet_peer_net_alloc(__u32 net_id)
        INIT_LIST_HEAD(&lpn->lpn_peer_nets);
        INIT_LIST_HEAD(&lpn->lpn_peer_nis);
        lpn->lpn_net_id = net_id;
+       lpn->lpn_sel_priority = LNET_MAX_SELECTION_PRIORITY;
 
        CDEBUG(D_NET, "%p net %s\n", lpn, libcfs_net2str(lpn->lpn_net_id));
 
@@ -895,6 +899,94 @@ lnet_push_update_to_peers(int force)
        wake_up(&the_lnet.ln_dc_waitq);
 }
 
+/* Look up gw_nid in the list of preferred router NIDs
+ * (lpni_rtr_pref_nids) of the remote peer NI.
+ * The list is walked without taking any lock here; presumably the caller
+ * holds the net lock, as the _locked suffix suggests.
+ * return:
+ *     false: list is not empty and NID is not preferred
+ *     false: list is empty
+ *     true: nid is found in the list
+ */
+bool
+lnet_peer_is_pref_rtr_locked(struct lnet_peer_ni *lpni,
+                            lnet_nid_t gw_nid)
+{
+       struct lnet_nid_list *ne;
+
+       CDEBUG(D_NET, "%s: rtr pref empty: %d\n",
+              libcfs_nid2str(lpni->lpni_nid),
+              list_empty(&lpni->lpni_rtr_pref_nids));
+
+       if (list_empty(&lpni->lpni_rtr_pref_nids))
+               return false;
+
+       /* iterate through all the preferred NIDs and see if any of them
+        * matches the provided gw_nid
+        */
+       list_for_each_entry(ne, &lpni->lpni_rtr_pref_nids, nl_list) {
+               CDEBUG(D_NET, "Comparing pref %s with gw %s\n",
+                      libcfs_nid2str(ne->nl_nid),
+                      libcfs_nid2str(gw_nid));
+               if (ne->nl_nid == gw_nid)
+                       return true;
+       }
+
+       return false;
+}
+
+/* Remove and free every entry on the peer NI's preferred router list. */
+void
+lnet_peer_clr_pref_rtrs(struct lnet_peer_ni *lpni)
+{
+       struct lnet_nid_list *entry;
+       struct list_head zombies;
+       int cpt = lpni->lpni_cpt;
+
+       INIT_LIST_HEAD(&zombies);
+
+       /* detach the whole list under the net lock for this peer NI's
+        * cpt; the entries are freed after the lock is dropped
+        */
+       lnet_net_lock(cpt);
+       list_splice_init(&lpni->lpni_rtr_pref_nids, &zombies);
+       lnet_net_unlock(cpt);
+
+       while (!list_empty(&zombies)) {
+               entry = list_first_entry(&zombies, struct lnet_nid_list,
+                                        nl_list);
+               list_del(&entry->nl_list);
+               LIBCFS_FREE(entry, sizeof(*entry));
+       }
+}
+
+/* Add gw_nid to the peer NI's list of preferred router NIDs.
+ *
+ * \retval 0 on success
+ * \retval -EEXIST if gw_nid is already on the list
+ * \retval -ENOMEM if the new list entry can not be allocated
+ */
+int
+lnet_peer_add_pref_rtr(struct lnet_peer_ni *lpni,
+                      lnet_nid_t gw_nid)
+{
+       struct lnet_nid_list *entry = NULL;
+       int cpt = lpni->lpni_cpt;
+
+       /* The caller holds api_mutex. The list is only modified when a
+        * UDSP is applied, which happens under api_mutex, so it can be
+        * walked below without taking the net lock.
+        */
+       __must_hold(&the_lnet.ln_api_mutex);
+
+       /* refuse to add the same gateway NID twice */
+       list_for_each_entry(entry, &lpni->lpni_rtr_pref_nids, nl_list) {
+               if (entry->nl_nid == gw_nid)
+                       return -EEXIST;
+       }
+
+       LIBCFS_CPT_ALLOC(entry, lnet_cpt_table(), cpt, sizeof(*entry));
+       if (entry == NULL)
+               return -ENOMEM;
+
+       entry->nl_nid = gw_nid;
+
+       /* The insertion itself is done under the cpt lock so that it is
+        * serialized against the checks in the selection algorithm
+        */
+       lnet_net_lock(cpt);
+       list_add(&entry->nl_list, &lpni->lpni_rtr_pref_nids);
+       lnet_net_unlock(cpt);
+
+       return 0;
+}
+
 /*
  * Test whether a ni is a preferred ni for this peer_ni, e.g, whether
  * this is a preferred point-to-point path. Call with lnet_net_lock in
@@ -903,14 +995,14 @@ lnet_push_update_to_peers(int force)
 bool
 lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, lnet_nid_t nid)
 {
-       int i;
+       struct lnet_nid_list *ne;
 
        if (lpni->lpni_pref_nnids == 0)
                return false;
        if (lpni->lpni_pref_nnids == 1)
                return lpni->lpni_pref.nid == nid;
-       for (i = 0; i < lpni->lpni_pref_nnids; i++) {
-               if (lpni->lpni_pref.nids[i] == nid)
+       list_for_each_entry(ne, &lpni->lpni_pref.nids, nl_list) {
+               if (ne->nl_nid == nid)
                        return true;
        }
        return false;
@@ -967,6 +1059,12 @@ lnet_peer_ni_clr_non_mr_pref_nid(struct lnet_peer_ni *lpni)
        return rc;
 }
 
+/* Store priority as the selection priority (lpni_sel_priority) of the
+ * peer NI. This is a plain assignment; no lock is taken here.
+ */
+void
+lnet_peer_ni_set_selection_priority(struct lnet_peer_ni *lpni, __u32 priority)
+{
+       lpni->lpni_sel_priority = priority;
+}
+
 /*
  * Clear the preferred NIDs from a non-multi-rail peer.
  */
@@ -982,11 +1080,10 @@ lnet_peer_clr_non_mr_pref_nids(struct lnet_peer *lp)
 int
 lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid)
 {
-       lnet_nid_t *nids = NULL;
-       lnet_nid_t *oldnids = NULL;
        struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer;
-       int size;
-       int i;
+       struct lnet_nid_list *ne1 = NULL;
+       struct lnet_nid_list *ne2 = NULL;
+       lnet_nid_t tmp_nid = LNET_NID_ANY;
        int rc = 0;
 
        if (nid == LNET_NID_ANY) {
@@ -1000,29 +1097,47 @@ lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid)
        }
 
        /* A non-MR node may have only one preferred NI per peer_ni */
-       if (lpni->lpni_pref_nnids > 0) {
-               if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) {
-                       rc = -EPERM;
-                       goto out;
-               }
+       if (lpni->lpni_pref_nnids > 0 &&
+           !(lp->lp_state & LNET_PEER_MULTI_RAIL)) {
+               rc = -EPERM;
+               goto out;
        }
 
+       /* add the new preferred nid to the list of preferred nids */
        if (lpni->lpni_pref_nnids != 0) {
-               size = sizeof(*nids) * (lpni->lpni_pref_nnids + 1);
-               LIBCFS_CPT_ALLOC(nids, lnet_cpt_table(), lpni->lpni_cpt, size);
-               if (!nids) {
+               size_t alloc_size = sizeof(*ne1);
+
+               if (lpni->lpni_pref_nnids == 1) {
+                       tmp_nid = lpni->lpni_pref.nid;
+                       INIT_LIST_HEAD(&lpni->lpni_pref.nids);
+               }
+
+               list_for_each_entry(ne1, &lpni->lpni_pref.nids, nl_list) {
+                       if (ne1->nl_nid == nid) {
+                               rc = -EEXIST;
+                               goto out;
+                       }
+               }
+
+               LIBCFS_CPT_ALLOC(ne1, lnet_cpt_table(), lpni->lpni_cpt,
+                                alloc_size);
+               if (!ne1) {
                        rc = -ENOMEM;
                        goto out;
                }
-               for (i = 0; i < lpni->lpni_pref_nnids; i++) {
-                       if (lpni->lpni_pref.nids[i] == nid) {
-                               LIBCFS_FREE(nids, size);
-                               rc = -EEXIST;
+
+               /* move the originally stored nid to the list */
+               if (lpni->lpni_pref_nnids == 1) {
+                       LIBCFS_CPT_ALLOC(ne2, lnet_cpt_table(),
+                               lpni->lpni_cpt, alloc_size);
+                       if (!ne2) {
+                               rc = -ENOMEM;
                                goto out;
                        }
-                       nids[i] = lpni->lpni_pref.nids[i];
+                       INIT_LIST_HEAD(&ne2->nl_list);
+                       ne2->nl_nid = tmp_nid;
                }
-               nids[i] = nid;
+               ne1->nl_nid = nid;
        }
 
        lnet_net_lock(LNET_LOCK_EX);
@@ -1030,18 +1145,15 @@ lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid)
        if (lpni->lpni_pref_nnids == 0) {
                lpni->lpni_pref.nid = nid;
        } else {
-               oldnids = lpni->lpni_pref.nids;
-               lpni->lpni_pref.nids = nids;
+               if (ne2)
+                       list_add_tail(&ne2->nl_list, &lpni->lpni_pref.nids);
+               list_add_tail(&ne1->nl_list, &lpni->lpni_pref.nids);
        }
        lpni->lpni_pref_nnids++;
        lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF;
        spin_unlock(&lpni->lpni_lock);
        lnet_net_unlock(LNET_LOCK_EX);
 
-       if (oldnids) {
-               size = sizeof(*nids) * (lpni->lpni_pref_nnids - 1);
-               CFS_FREE_PTR_ARRAY(oldnids, size);
-       }
 out:
        if (rc == -EEXIST && (lpni->lpni_state & LNET_PEER_NI_NON_MR_PREF)) {
                spin_lock(&lpni->lpni_lock);
@@ -1056,11 +1168,8 @@ out:
 int
 lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid)
 {
-       lnet_nid_t *nids = NULL;
-       lnet_nid_t *oldnids = NULL;
        struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer;
-       int size;
-       int i, j;
+       struct lnet_nid_list *ne = NULL;
        int rc = 0;
 
        if (lpni->lpni_pref_nnids == 0) {
@@ -1073,61 +1182,71 @@ lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid)
                        rc = -ENOENT;
                        goto out;
                }
-       } else if (lpni->lpni_pref_nnids == 2) {
-               if (lpni->lpni_pref.nids[0] != nid &&
-                   lpni->lpni_pref.nids[1] != nid) {
-                       rc = -ENOENT;
-                       goto out;
-               }
        } else {
-               size = sizeof(*nids) * (lpni->lpni_pref_nnids - 1);
-               LIBCFS_CPT_ALLOC(nids, lnet_cpt_table(), lpni->lpni_cpt, size);
-               if (!nids) {
-                       rc = -ENOMEM;
-                       goto out;
-               }
-               for (i = 0, j = 0; i < lpni->lpni_pref_nnids; i++) {
-                       if (lpni->lpni_pref.nids[i] != nid)
-                               continue;
-                       nids[j++] = lpni->lpni_pref.nids[i];
-               }
-               /* Check if we actually removed a nid. */
-               if (j == lpni->lpni_pref_nnids) {
-                       LIBCFS_FREE(nids, size);
-                       rc = -ENOENT;
-                       goto out;
+               list_for_each_entry(ne, &lpni->lpni_pref.nids, nl_list) {
+                       if (ne->nl_nid == nid)
+                               goto remove_nid_entry;
                }
+               rc = -ENOENT;
+               ne = NULL;
+               goto out;
        }
 
+remove_nid_entry:
        lnet_net_lock(LNET_LOCK_EX);
        spin_lock(&lpni->lpni_lock);
-       if (lpni->lpni_pref_nnids == 1) {
+       if (lpni->lpni_pref_nnids == 1)
                lpni->lpni_pref.nid = LNET_NID_ANY;
-       } else if (lpni->lpni_pref_nnids == 2) {
-               oldnids = lpni->lpni_pref.nids;
-               if (oldnids[0] == nid)
-                       lpni->lpni_pref.nid = oldnids[1];
-               else
-                       lpni->lpni_pref.nid = oldnids[2];
-       } else {
-               oldnids = lpni->lpni_pref.nids;
-               lpni->lpni_pref.nids = nids;
+       else {
+               list_del_init(&ne->nl_list);
+               if (lpni->lpni_pref_nnids == 2) {
+                       struct lnet_nid_list *ne, *tmp;
+
+                       list_for_each_entry_safe(ne, tmp,
+                                                &lpni->lpni_pref.nids,
+                                                nl_list) {
+                               lpni->lpni_pref.nid = ne->nl_nid;
+                               list_del_init(&ne->nl_list);
+                               LIBCFS_FREE(ne, sizeof(*ne));
+                       }
+               }
        }
        lpni->lpni_pref_nnids--;
        lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF;
        spin_unlock(&lpni->lpni_lock);
        lnet_net_unlock(LNET_LOCK_EX);
 
-       if (oldnids) {
-               size = sizeof(*nids) * (lpni->lpni_pref_nnids + 1);
-               CFS_FREE_PTR_ARRAY(oldnids, size);
-       }
+       if (ne)
+               LIBCFS_FREE(ne, sizeof(*ne));
 out:
        CDEBUG(D_NET, "peer %s nid %s: %d\n",
               libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), rc);
        return rc;
 }
 
+/* Drop all preferred NIDs of a peer NI and reset its count to zero.
+ *
+ * A single preferred NID is stored inline in lpni_pref.nid, more than one
+ * are kept on the lpni_pref.nids list; both representations are handled.
+ */
+void
+lnet_peer_clr_pref_nids(struct lnet_peer_ni *lpni)
+{
+       struct lnet_nid_list *entry;
+       struct list_head zombies;
+
+       INIT_LIST_HEAD(&zombies);
+
+       lnet_net_lock(LNET_LOCK_EX);
+       if (lpni->lpni_pref_nnids > 1)
+               /* detach the list now, free its entries after unlocking */
+               list_splice_init(&lpni->lpni_pref.nids, &zombies);
+       else if (lpni->lpni_pref_nnids == 1)
+               lpni->lpni_pref.nid = LNET_NID_ANY;
+       lpni->lpni_pref_nnids = 0;
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       while (!list_empty(&zombies)) {
+               entry = list_first_entry(&zombies, struct lnet_nid_list,
+                                        nl_list);
+               list_del_init(&entry->nl_list);
+               LIBCFS_FREE(entry, sizeof(*entry));
+       }
+}
+
 lnet_nid_t
 lnet_peer_primary_nid_locked(lnet_nid_t nid)
 {
@@ -1248,6 +1367,8 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp,
                                unsigned flags)
 {
        struct lnet_peer_table *ptable;
+       bool new_lpn = false;
+       int rc;
 
        /* Install the new peer_ni */
        lnet_net_lock(LNET_LOCK_EX);
@@ -1278,6 +1399,7 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp,
 
        /* Add peer_net to peer */
        if (!lpn->lpn_peer) {
+               new_lpn = true;
                lpn->lpn_peer = lp;
                list_add_tail(&lpn->lpn_peer_nets, &lp->lp_peer_nets);
                lnet_peer_addref_locked(lp);
@@ -1307,6 +1429,18 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp,
 
        lp->lp_nnis++;
 
+       /* apply UDSPs */
+       if (new_lpn) {
+               rc = lnet_udsp_apply_policies_on_lpn(lpn);
+               if (rc)
+                       CERROR("Failed to apply UDSPs on lpn %s\n",
+                              libcfs_net2str(lpn->lpn_net_id));
+       }
+       rc = lnet_udsp_apply_policies_on_lpni(lpni);
+       if (rc)
+               CERROR("Failed to apply UDSPs on lpni %s\n",
+                      libcfs_nid2str(lpni->lpni_nid));
+
        CDEBUG(D_NET, "peer %s NID %s flags %#x\n",
               libcfs_nid2str(lp->lp_primary_nid),
               libcfs_nid2str(lpni->lpni_nid), flags);
@@ -1719,9 +1853,15 @@ lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni)
                spin_unlock(&ptable->pt_zombie_lock);
        }
 
-       if (lpni->lpni_pref_nnids > 1)
-               CFS_FREE_PTR_ARRAY(lpni->lpni_pref.nids, lpni->lpni_pref_nnids);
+       if (lpni->lpni_pref_nnids > 1) {
+               struct lnet_nid_list *ne, *tmp;
 
+               list_for_each_entry_safe(ne, tmp, &lpni->lpni_pref.nids,
+                                        nl_list) {
+                       list_del_init(&ne->nl_list);
+                       LIBCFS_FREE(ne, sizeof(*ne));
+               }
+       }
        LIBCFS_FREE(lpni, sizeof(*lpni));
 
        if (lpn)
diff --git a/lnet/lnet/udsp.c b/lnet/lnet/udsp.c
new file mode 100644 (file)
index 0000000..1b0db21
--- /dev/null
@@ -0,0 +1,1547 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ *
+ * Copyright (c) 2018-2020 Data Direct Networks.
+ *
+ *   This file is part of Lustre, https://wiki.whamcloud.com/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   version 2 along with this program; If not, see
+ *   http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ *   lnet/lnet/udsp.c
+ *
+ *   User Defined Selection Policies (UDSP) are introduced to add
+ *   ability of fine traffic control. The policies are instantiated
+ *   on LNet constructs and allow preference of some constructs
+ *   over others as an extension of the selection algorithm.
+ *   The order of operation is defined by the selection algorithm logical flow:
+ *
+ *   1. Iterate over all the networks that a peer can be reached on
+ *      and select the best local network
+ *      - The remote network with the highest priority is examined
+ *        (Network Rule)
+ *      - The local network with the highest priority is selected
+ *        (Network Rule)
+ *      - The local NI with the highest priority is selected
+ *        (NID Rule)
+ *   2. If the peer is a remote peer and has no local networks,
+ *      - then select the remote peer network with the highest priority
+ *        (Network Rule)
+ *      - Select the highest priority remote peer_ni on the network selected
+ *        (NID Rule)
+ *      - Now that the peer's network and NI are decided, select the router
+ *        in round robin from the peer NI's preferred router list.
+ *        (Router Rule)
+ *      - Select the highest priority local NI on the local net of the
+ *        selected route.
+ *        (NID Rule)
+ *   3. Otherwise for local peers, select the peer_ni from the peer.
+ *      - highest priority peer NI is selected
+ *        (NID Rule)
+ *      - Select the peer NI which has the local NI selected on its
+ *        preferred list.
+ *        (NID Pair Rule)
+ *
+ *   Accordingly, the User Interface allows for the following:
+ *   - Adding a local network udsp: if multiple local networks are
+ *     available, each one can have a priority.
+ *   - Adding a local NID udsp: after a local network is chosen,
+ *     if there are multiple NIs, each one can have a priority.
+ *   - Adding a remote NID udsp: assign priority to a peer NID.
+ *   - Adding a NID pair udsp: allows specifying local NIDs
+ *     to be added to the preferred list of the specified peer NIs.
+ *     When selecting a peer NI, the one with the
+ *     local NID being used on its list is preferred.
+ *   - Adding a Router udsp: similar to the NID pair udsp.
+ *     Specified router NIDs are added on the list on the specified peer NIs.
+ *     When sending to a remote peer, remote net is selected and the peer NID
+ *     is selected. The router which has its nid on the peer NI list
+ *     is preferred.
+ *   - Deleting a udsp: use the specified policy index to remove it
+ *     from the policy list.
+ *
+ *   Generally, the syntax is as follows
+ *     lnetctl policy <add | del | show>
+ *      --src:      ip2nets syntax specifying the local NID to match
+ *      --dst:      ip2nets syntax specifying the remote NID to match
+ *      --rte:      ip2nets syntax specifying the router NID to match
+ *      --priority: Priority to apply to rule matches
+ *      --idx:      Index of where to insert or delete the rule
+ *                  By default add appends to the end of the rule list
+ *
+ * Author: Amir Shehata
+ */
+
+#include <linux/uaccess.h>
+
+#include <lnet/udsp.h>
+#include <libcfs/libcfs.h>
+
+/* Argument bundle passed to the udsp_apply_rule handlers. Which fields are
+ * meaningful depends on the handler; lnet_udsp_apply_rule_on_ni() reads
+ * udi_ni, udi_match, udi_priority and udi_revert.
+ */
+struct udsp_info {
+       struct lnet_peer_ni *udi_lpni;  /* presumably the peer NI target */
+       struct lnet_peer_net *udi_lpn;  /* presumably the peer net target */
+       struct lnet_ni *udi_ni;         /* local NI tested against udi_match */
+       struct lnet_net *udi_net;       /* presumably the local net target */
+       struct lnet_ud_nid_descr *udi_match;    /* descriptor to match */
+       struct lnet_ud_nid_descr *udi_action;   /* NOTE(review): use not
+                                                * visible here
+                                                */
+       __u32 udi_priority;     /* priority applied on a match */
+       enum lnet_udsp_action_type udi_type;    /* action type of the rule */
+       bool udi_local;         /* NOTE(review): use not visible here */
+       bool udi_revert;        /* apply -1 (as __u32) instead of
+                                * udi_priority
+                                */
+};
+
+typedef int (*udsp_apply_rule)(struct udsp_info *); /* rule handler type */
+
+enum udsp_apply {      /* NOTE(review): users of this enum are not visible
+                        * here; names suggest which construct a rule is
+                        * applied on -- confirm against the callers
+                        */
+       UDSP_APPLY_ON_PEERS = 0,
+       UDSP_APPLY_PRIO_ON_NIS = 1,
+       UDSP_APPLY_RTE_ON_NETS = 2,
+       UDSP_APPLY_MAX_ENUM = 3,        /* number of values above */
+};
+
+#define RULE_NOT_APPLICABLE -1 /* presumably a handler status; confirm */
+
+/* A descriptor is treated as a net rule when its address range list
+ * (ud_addr_range) is empty.
+ */
+static inline bool
+lnet_udsp_is_net_rule(struct lnet_ud_nid_descr *match)
+{
+       return list_empty(&match->ud_addr_range);
+}
+
+/* Compare two lists of struct cfs_range_expr (linked through re_link)
+ * element by element. The lists are equal when they have the same length
+ * and every pair of ranges agrees on re_lo, re_hi and re_stride.
+ */
+static bool
+lnet_udsp_range_list_equal(struct list_head *r1, struct list_head *r2)
+{
+       struct cfs_range_expr *lhs;
+       struct cfs_range_expr *rhs;
+
+       if (lnet_get_list_len(r1) != lnet_get_list_len(r2))
+               return false;
+
+       /* walk both lists in lockstep; the lengths are known to be equal
+        * so rhs is a valid entry whenever it is dereferenced
+        */
+       rhs = list_first_entry(r2, struct cfs_range_expr, re_link);
+       list_for_each_entry(lhs, r1, re_link) {
+               if (!(lhs->re_lo == rhs->re_lo &&
+                     lhs->re_hi == rhs->re_hi &&
+                     lhs->re_stride == rhs->re_stride))
+                       return false;
+               rhs = list_next_entry(rhs, re_link);
+       }
+
+       return true;
+}
+
+/* Compare two lists of struct cfs_expr_list (linked through el_link).
+ * Two empty lists are equal. Otherwise the lists must have the same
+ * length and, position by position, identical range lists (el_exprs).
+ */
+static bool
+lnet_udsp_expr_list_equal(struct list_head *e1,
+                         struct list_head *e2)
+{
+       struct cfs_expr_list *lhs;
+       struct cfs_expr_list *rhs;
+
+       if (list_empty(e1) && list_empty(e2))
+               return true;
+
+       if (lnet_get_list_len(e1) != lnet_get_list_len(e2))
+               return false;
+
+       rhs = list_first_entry(e2, struct cfs_expr_list, el_link);
+       list_for_each_entry(lhs, e1, el_link) {
+               if (!lnet_udsp_range_list_equal(&lhs->el_exprs,
+                                               &rhs->el_exprs))
+                       return false;
+               rhs = list_next_entry(rhs, el_link);
+       }
+
+       return true;
+}
+
+/* Two NID descriptors are equal when they have the same net type, equal
+ * net number range lists and equal address range lists.
+ */
+static bool
+lnet_udsp_nid_descr_equal(struct lnet_ud_nid_descr *e1,
+                         struct lnet_ud_nid_descr *e2)
+{
+       if (e1->ud_net_id.udn_net_type != e2->ud_net_id.udn_net_type)
+               return false;
+
+       if (!lnet_udsp_expr_list_equal(&e1->ud_net_id.udn_net_num_range,
+                                      &e2->ud_net_id.udn_net_num_range))
+               return false;
+
+       return lnet_udsp_expr_list_equal(&e1->ud_addr_range,
+                                        &e2->ud_addr_range);
+}
+
+/* Two policies have equal actions when the action types are the same and,
+ * for priority actions only, the priorities are the same as well.
+ */
+static bool
+lnet_udsp_action_equal(struct lnet_udsp *e1, struct lnet_udsp *e2)
+{
+       if (e1->udsp_action_type != e2->udsp_action_type)
+               return false;
+
+       /* only priority actions carry a value that needs comparing */
+       if (e1->udsp_action_type != EN_LNET_UDSP_ACTION_PRIORITY)
+               return true;
+
+       return e1->udsp_action.udsp_priority ==
+              e2->udsp_action.udsp_priority;
+}
+
+/* Two policies are equal when their source, destination and router NID
+ * descriptors are all equal. Only the descriptors are compared here.
+ */
+static bool
+lnet_udsp_equal(struct lnet_udsp *e1, struct lnet_udsp *e2)
+{
+       return lnet_udsp_nid_descr_equal(&e1->udsp_src, &e2->udsp_src) &&
+              lnet_udsp_nid_descr_equal(&e1->udsp_dst, &e2->udsp_dst) &&
+              lnet_udsp_nid_descr_equal(&e1->udsp_rte, &e2->udsp_rte);
+}
+
+/* Report whether a NID descriptor carries any criteria, i.e. whether its
+ * net type is non-zero.
+ *
+ * it is enough to look at the net type of the descriptor. If the criteria
+ * is present the net must be specified
+ */
+static inline bool
+lnet_udsp_criteria_present(struct lnet_ud_nid_descr *descr)
+{
+       return (descr->ud_net_id.udn_net_type != 0);
+}
+
+/* Set the selection priority of the local NI in udi->udi_ni if its NID
+ * matches the descriptor in udi->udi_match. Always returns 0.
+ */
+static int
+lnet_udsp_apply_rule_on_ni(struct udsp_info *udi)
+{
+       struct lnet_ud_nid_descr *descr = udi->udi_match;
+       struct lnet_ni *ni = udi->udi_ni;
+       __u32 prio;
+
+       if (!cfs_match_nid_net(ni->ni_nid,
+                              descr->ud_net_id.udn_net_type,
+                              &descr->ud_net_id.udn_net_num_range,
+                              &descr->ud_addr_range))
+               return 0;
+
+       /* a revert installs -1 instead of the policy's priority */
+       if (udi->udi_revert)
+               prio = -1;
+       else
+               prio = udi->udi_priority;
+
+       CDEBUG(D_NET, "apply udsp on ni %s\n",
+              libcfs_nid2str(ni->ni_nid));
+
+       /* Detected match. Set NIDs priority */
+       lnet_ni_set_sel_priority_locked(ni, prio);
+
+       return 0;
+}
+
+/* Make every gateway matching @rte_action a preferred router of the local
+ * net @net, or clear @net's preferred routers when @revert is set.
+ *
+ * Every route of every remote net is examined. A gateway matches when one
+ * of its peer NIs on a locally configured net matches the descriptor, or,
+ * failing that, when its primary NID matches. On the first match the
+ * current preferred router list of @net is cleared; the gateway's primary
+ * NID is then added.
+ *
+ * LNET_LOCK_EX is released around the preferred router updates and taken
+ * again afterwards, so the function must be entered with that lock held.
+ *
+ * NOTE(review): the route lists are walked across those unlock/lock
+ * windows; presumably something else keeps them stable - confirm.
+ *
+ * Returns 0 on success (a router that is already present, -EEXIST, counts
+ * as success) or the error returned by lnet_net_add_pref_rtr().
+ */
+static int
+lnet_udsp_apply_rte_list_on_net(struct lnet_net *net,
+                               struct lnet_ud_nid_descr *rte_action,
+                               bool revert)
+{
+       struct lnet_remotenet *rnet;
+       struct list_head *rn_list;
+       struct lnet_route *route;
+       struct lnet_peer_ni *lpni;
+       bool cleared = false;
+       lnet_nid_t gw_nid, gw_prim_nid;
+       int rc = 0;
+       int i;
+
+       for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+               rn_list = &the_lnet.ln_remote_nets_hash[i];
+               list_for_each_entry(rnet, rn_list, lrn_list) {
+                       list_for_each_entry(route, &rnet->lrn_routes, lr_list) {
+                               /* look if gw nid on the same net matches */
+                               gw_prim_nid = route->lr_gateway->lp_primary_nid;
+                               lpni = NULL;
+                               /* start every route from "no match" so a
+                                * value left over from the previous route
+                                * can never be mistaken for a match
+                                */
+                               rc = 0;
+                               while ((lpni = lnet_get_next_peer_ni_locked(route->lr_gateway,
+                                                                           NULL,
+                                                                           lpni)) != NULL) {
+                                       if (!lnet_get_net_locked(lpni->lpni_peer_net->lpn_net_id))
+                                               continue;
+                                       gw_nid = lpni->lpni_nid;
+                                       rc = cfs_match_nid_net(gw_nid,
+                                               rte_action->ud_net_id.udn_net_type,
+                                               &rte_action->ud_net_id.udn_net_num_range,
+                                               &rte_action->ud_addr_range);
+                                       if (rc)
+                                               break;
+                               }
+                               /* match gw primary nid on a remote network */
+                               if (!rc) {
+                                       gw_nid = gw_prim_nid;
+                                       rc = cfs_match_nid_net(gw_nid,
+                                               rte_action->ud_net_id.udn_net_type,
+                                               &rte_action->ud_net_id.udn_net_num_range,
+                                               &rte_action->ud_addr_range);
+                               }
+                               if (!rc)
+                                       continue;
+                               lnet_net_unlock(LNET_LOCK_EX);
+                               if (!cleared || revert) {
+                                       lnet_net_clr_pref_rtrs(net);
+                                       cleared = true;
+                                       if (revert) {
+                                               lnet_net_lock(LNET_LOCK_EX);
+                                               continue;
+                                       }
+                               }
+                               /* match. Add to pref NIDs */
+                               CDEBUG(D_NET, "udsp net->gw: %s->%s\n",
+                                      libcfs_net2str(net->net_id),
+                                      libcfs_nid2str(gw_prim_nid));
+                               rc = lnet_net_add_pref_rtr(net, gw_prim_nid);
+                               lnet_net_lock(LNET_LOCK_EX);
+                               /* success if EEXIST return */
+                               if (rc && rc != -EEXIST) {
+                                       CERROR("Failed to add %s to %s pref rtr list\n",
+                                              libcfs_nid2str(gw_prim_nid),
+                                              libcfs_net2str(net->net_id));
+                                       return rc;
+                               }
+                       }
+               }
+       }
+
+       /* rc may still hold the last match result or -EEXIST here; neither
+        * is a failure, so do not hand it back to the caller
+        */
+       return 0;
+}
+
+/* Apply the router descriptor in udi->udi_action to every local net that
+ * matches udi->udi_match. All matching nets are processed even if one of
+ * them fails; the last non-zero result is returned.
+ */
+static int
+lnet_udsp_apply_rte_rule_on_nets(struct udsp_info *udi)
+{
+       struct lnet_ud_nid_descr *rte_action = udi->udi_action;
+       struct lnet_ud_nid_descr *match = udi->udi_match;
+       struct lnet_net *net;
+       int last_failure = 0;
+       int rc;
+
+       list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+               /* net type filter ahead of the full net match */
+               if (LNET_NETTYP(net->net_id) != match->ud_net_id.udn_net_type)
+                       continue;
+
+               if (!cfs_match_net(net->net_id,
+                                  match->ud_net_id.udn_net_type,
+                                  &match->ud_net_id.udn_net_num_range))
+                       continue;
+
+               CDEBUG(D_NET, "apply rule on %s\n",
+                      libcfs_net2str(net->net_id));
+               rc = lnet_udsp_apply_rte_list_on_net(net, rte_action,
+                                                    udi->udi_revert);
+               if (rc != 0)
+                       last_failure = rc;
+       }
+
+       return last_failure;
+}
+
+/* Single net variant: apply the router descriptor in udi->udi_action to
+ * udi->udi_net if that net matches udi->udi_match. Returns 0 when the net
+ * does not match, otherwise the result of
+ * lnet_udsp_apply_rte_list_on_net().
+ */
+static int
+lnet_udsp_apply_rte_rule_on_net(struct udsp_info *udi)
+{
+       struct lnet_ud_nid_descr *rte_action = udi->udi_action;
+       struct lnet_ud_nid_descr *match = udi->udi_match;
+       struct lnet_net *net = udi->udi_net;
+
+       if (!cfs_match_net(net->net_id,
+                          match->ud_net_id.udn_net_type,
+                          &match->ud_net_id.udn_net_num_range))
+               return 0;
+
+       CDEBUG(D_NET, "apply rule on %s\n",
+               libcfs_net2str(net->net_id));
+
+       return lnet_udsp_apply_rte_list_on_net(net, rte_action,
+                                              udi->udi_revert);
+}
+
+/* Set the selection priority of the local net in udi->udi_net.
+ *
+ * Returns RULE_NOT_APPLICABLE when udi->udi_match is not a net rule
+ * according to lnet_udsp_is_net_rule(), and 0 otherwise, whether or not
+ * the net matched.
+ */
+static int
+lnet_udsp_apply_prio_rule_on_net(struct udsp_info *udi)
+{
+       struct lnet_ud_nid_descr *descr = udi->udi_match;
+       struct lnet_net *net = udi->udi_net;
+       __u32 prio;
+
+       if (!lnet_udsp_is_net_rule(descr))
+               return RULE_NOT_APPLICABLE;
+
+       if (!cfs_match_net(net->net_id,
+                          descr->ud_net_id.udn_net_type,
+                          &descr->ud_net_id.udn_net_num_range))
+               return 0;
+
+       /* a revert installs -1 instead of the policy's priority */
+       if (udi->udi_revert)
+               prio = -1;
+       else
+               prio = udi->udi_priority;
+
+       CDEBUG(D_NET, "apply rule on %s\n",
+              libcfs_net2str(net->net_id));
+
+       lnet_net_set_sel_priority_locked(net, prio);
+
+       return 0;
+}
+
+/* Apply a source priority rule to the local nets and NIs.
+ *
+ * For every local net whose type equals the rule's net type the net level
+ * callback is tried first; only when it returns non-zero is the rule
+ * applied to each NI of that net. The last non-zero NI result is
+ * returned.
+ */
+static int
+lnet_udsp_apply_rule_on_nis(struct udsp_info *udi)
+{
+       struct lnet_ud_nid_descr *descr = udi->udi_match;
+       struct lnet_net *net;
+       struct lnet_ni *ni;
+       int last_failure = 0;
+       int rc;
+
+       list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+               if (LNET_NETTYP(net->net_id) != descr->ud_net_id.udn_net_type)
+                       continue;
+
+               udi->udi_net = net;
+               /* a zero return means the net level callback handled it */
+               if (lnet_udsp_apply_prio_rule_on_net(udi) == 0)
+                       continue;
+
+               list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+                       udi->udi_ni = ni;
+                       rc = lnet_udsp_apply_rule_on_ni(udi);
+                       if (rc != 0)
+                               last_failure = rc;
+               }
+       }
+
+       return last_failure;
+}
+
+/* Make every gateway whose primary NID matches @rte_action a preferred
+ * router of the peer NI @lpni, or clear @lpni's preferred routers when
+ * @revert is set.
+ *
+ * On the first match the current preferred router list of @lpni is
+ * cleared before the gateway is added.
+ *
+ * LNET_LOCK_EX is released around the preferred router updates and taken
+ * again afterwards, so the function must be entered with that lock held.
+ *
+ * NOTE(review): the route lists are walked across those unlock/lock
+ * windows; presumably something else keeps them stable - confirm.
+ *
+ * Returns 0 on success (a router that is already present, -EEXIST, counts
+ * as success) or the error returned by lnet_peer_add_pref_rtr().
+ */
+static int
+lnet_udsp_apply_rte_list_on_lpni(struct lnet_peer_ni *lpni,
+                                struct lnet_ud_nid_descr *rte_action,
+                                bool revert)
+{
+       struct lnet_remotenet *rnet;
+       struct list_head *rn_list;
+       struct lnet_route *route;
+       bool cleared = false;
+       lnet_nid_t gw_nid;
+       int rc = 0;
+       int i;
+
+       for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+               rn_list = &the_lnet.ln_remote_nets_hash[i];
+               list_for_each_entry(rnet, rn_list, lrn_list) {
+                       list_for_each_entry(route, &rnet->lrn_routes, lr_list) {
+                               gw_nid = route->lr_gateway->lp_primary_nid;
+                               rc = cfs_match_nid_net(gw_nid,
+                                       rte_action->ud_net_id.udn_net_type,
+                                       &rte_action->ud_net_id.udn_net_num_range,
+                                       &rte_action->ud_addr_range);
+                               if (!rc)
+                                       continue;
+                               lnet_net_unlock(LNET_LOCK_EX);
+                               if (!cleared || revert) {
+                                       CDEBUG(D_NET, "%spref rtr nids from lpni %s\n",
+                                              (revert) ? "revert " : "clear ",
+                                              libcfs_nid2str(lpni->lpni_nid));
+                                       lnet_peer_clr_pref_rtrs(lpni);
+                                       cleared = true;
+                                       if (revert) {
+                                               lnet_net_lock(LNET_LOCK_EX);
+                                               continue;
+                                       }
+                               }
+                               CDEBUG(D_NET, "add gw nid %s as preferred for peer %s\n",
+                                      libcfs_nid2str(gw_nid),
+                                      libcfs_nid2str(lpni->lpni_nid));
+                               /* match. Add to pref NIDs */
+                               rc = lnet_peer_add_pref_rtr(lpni, gw_nid);
+                               lnet_net_lock(LNET_LOCK_EX);
+                               /* success if EEXIST return */
+                               if (rc && rc != -EEXIST) {
+                                       CERROR("Failed to add %s to %s pref rtr list\n",
+                                              libcfs_nid2str(gw_nid),
+                                              libcfs_nid2str(lpni->lpni_nid));
+                                       return rc;
+                               }
+                       }
+               }
+       }
+
+       /* rc may still hold the last match result or -EEXIST here; neither
+        * is a failure, so do not hand it back to the caller
+        */
+       return 0;
+}
+
+/* Make every local NI whose NID matches @ni_action a preferred NID of the
+ * peer NI @lpni, or clear @lpni's preferred NIDs when @revert is set.
+ *
+ * Only local nets whose type equals the descriptor's net type are
+ * searched. On the first match the current preferred NID list of @lpni is
+ * cleared before the NID is added.
+ *
+ * LNET_LOCK_EX is released around the preferred NID updates and taken
+ * again afterwards, so the function must be entered with that lock held.
+ *
+ * NOTE(review): the net and NI lists are walked across those unlock/lock
+ * windows; presumably something else keeps them stable - confirm.
+ *
+ * Returns 0 on success (a NID that is already present, -EEXIST, counts as
+ * success) or the error returned by lnet_peer_add_pref_nid().
+ */
+static int
+lnet_udsp_apply_ni_list(struct lnet_peer_ni *lpni,
+                       struct lnet_ud_nid_descr *ni_action,
+                       bool revert)
+{
+       int rc = 0;
+       struct lnet_ni *ni;
+       struct lnet_net *net;
+       bool cleared = false;
+
+       list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+               if (LNET_NETTYP(net->net_id) != ni_action->ud_net_id.udn_net_type)
+                       continue;
+               list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+                       rc = cfs_match_nid_net(ni->ni_nid,
+                               ni_action->ud_net_id.udn_net_type,
+                               &ni_action->ud_net_id.udn_net_num_range,
+                               &ni_action->ud_addr_range);
+                       if (!rc)
+                               continue;
+                       lnet_net_unlock(LNET_LOCK_EX);
+                       if (!cleared || revert) {
+                               lnet_peer_clr_pref_nids(lpni);
+                               CDEBUG(D_NET, "%spref nids from lpni %s\n",
+                                       (revert) ? "revert " : "clear ",
+                                       libcfs_nid2str(lpni->lpni_nid));
+                               cleared = true;
+                               if (revert) {
+                                       lnet_net_lock(LNET_LOCK_EX);
+                                       continue;
+                               }
+                       }
+                       CDEBUG(D_NET, "add nid %s as preferred for peer %s\n",
+                               libcfs_nid2str(ni->ni_nid),
+                               libcfs_nid2str(lpni->lpni_nid));
+                       /* match. Add to pref NIDs */
+                       rc = lnet_peer_add_pref_nid(lpni, ni->ni_nid);
+                       lnet_net_lock(LNET_LOCK_EX);
+                       /* success if EEXIST return */
+                       if (rc && rc != -EEXIST) {
+                               CERROR("Failed to add %s to %s pref nid list\n",
+                                       libcfs_nid2str(ni->ni_nid),
+                                       libcfs_nid2str(lpni->lpni_nid));
+                               return rc;
+                       }
+               }
+       }
+
+       /* rc may still hold the last match result or -EEXIST here; neither
+        * is a failure, so do not hand it back to the caller
+        */
+       return 0;
+}
+
+/* Apply the rule described by @udi to the peer NI in udi->udi_lpni.
+ *
+ * The peer NI qualifies when its NID matches udi->udi_match, or when the
+ * rule carries no address range and the peer net matches the rule's net.
+ * Depending on udi->udi_type and udi->udi_local the action is then one
+ * of:
+ *   - preferred list, local:     add matching local NIs as preferred NIDs
+ *   - preferred list, not local: add matching gateways as preferred routers
+ *   - anything else:             set the peer NI selection priority
+ *
+ * Returns 0 when nothing matched or the action succeeded, otherwise the
+ * error from the preferred list helper.
+ */
+static int
+lnet_udsp_apply_rule_on_lpni(struct udsp_info *udi)
+{
+       int rc;
+       struct lnet_peer_ni *lpni = udi->udi_lpni;
+       struct lnet_peer_net *lpn = udi->udi_lpn;
+       struct lnet_ud_nid_descr *lp_match = udi->udi_match;
+       struct lnet_ud_nid_descr *action = udi->udi_action;
+       __u32 priority = (udi->udi_revert) ? -1 : udi->udi_priority;
+       bool local = udi->udi_local;
+       enum lnet_udsp_action_type type = udi->udi_type;
+
+       /* udi_lpn is filled in when the peer tables are walked, but not
+        * when the rule is applied to a single peer NI; fall back to the
+        * peer net the NI is attached to rather than dereferencing NULL
+        */
+       if (!lpn)
+               lpn = lpni->lpni_peer_net;
+
+       rc = cfs_match_nid_net(lpni->lpni_nid,
+               lp_match->ud_net_id.udn_net_type,
+               &lp_match->ud_net_id.udn_net_num_range,
+               &lp_match->ud_addr_range);
+
+       /* check if looking for a net match */
+       if (!rc &&
+           (lnet_get_list_len(&lp_match->ud_addr_range) ||
+            !lpn ||
+            !cfs_match_net(lpn->lpn_net_id,
+                          lp_match->ud_net_id.udn_net_type,
+                          &lp_match->ud_net_id.udn_net_num_range))) {
+               return 0;
+       }
+
+       if (type == EN_LNET_UDSP_ACTION_PREFERRED_LIST && local) {
+               rc = lnet_udsp_apply_ni_list(lpni, action,
+                                            udi->udi_revert);
+               if (rc)
+                       return rc;
+       } else if (type == EN_LNET_UDSP_ACTION_PREFERRED_LIST &&
+                       !local) {
+               rc = lnet_udsp_apply_rte_list_on_lpni(lpni, action,
+                                                     udi->udi_revert);
+               if (rc)
+                       return rc;
+       } else {
+               lnet_peer_ni_set_selection_priority(lpni, priority);
+       }
+
+       return 0;
+}
+
+/* Set the selection priority of the peer net in udi->udi_lpn.
+ *
+ * Returns RULE_NOT_APPLICABLE for preferred list actions and for rules
+ * that lnet_udsp_is_net_rule() does not classify as net rules; returns 0
+ * otherwise, whether or not the peer net matched.
+ */
+static int
+lnet_udsp_apply_rule_on_lpn(struct udsp_info *udi)
+{
+       struct lnet_ud_nid_descr *descr = udi->udi_match;
+       struct lnet_peer_net *lpn = udi->udi_lpn;
+       __u32 prio;
+
+       if (udi->udi_type == EN_LNET_UDSP_ACTION_PREFERRED_LIST)
+               return RULE_NOT_APPLICABLE;
+
+       if (!lnet_udsp_is_net_rule(descr))
+               return RULE_NOT_APPLICABLE;
+
+       if (!cfs_match_net(lpn->lpn_net_id,
+                          descr->ud_net_id.udn_net_type,
+                          &descr->ud_net_id.udn_net_num_range))
+               return 0;
+
+       /* a revert installs -1 instead of the policy's priority */
+       if (udi->udi_revert)
+               prio = -1;
+       else
+               prio = udi->udi_priority;
+
+       CDEBUG(D_NET, "apply rule on lpn %s\n",
+              libcfs_net2str(lpn->lpn_net_id));
+       lnet_peer_net_set_sel_priority_locked(lpn, prio);
+
+       return 0;
+}
+
+/* Apply the rule in @udi to every peer known to the system.
+ *
+ * All peer tables are scanned. For each peer net whose type equals the
+ * rule's net type the peer net callback is tried first; only when it
+ * returns non-zero is the rule applied to each peer NI of that peer net.
+ * The last non-zero peer NI result is returned.
+ */
+static int
+lnet_udsp_apply_rule_on_lpnis(struct udsp_info *udi)
+{
+       struct lnet_ud_nid_descr *descr = udi->udi_match;
+       int ntables = cfs_percpt_number(the_lnet.ln_peer_tables);
+       struct lnet_peer_table *ptable;
+       struct lnet_peer_net *lpn;
+       struct lnet_peer_ni *lpni;
+       struct lnet_peer *lp;
+       int last_failure = 0;
+       int cpt;
+       int rc;
+
+       for (cpt = 0; cpt < ntables; cpt++) {
+               ptable = the_lnet.ln_peer_tables[cpt];
+               list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) {
+                       CDEBUG(D_NET, "udsp examining lp %s\n",
+                              libcfs_nid2str(lp->lp_primary_nid));
+                       list_for_each_entry(lpn, &lp->lp_peer_nets,
+                                           lpn_peer_nets) {
+                               CDEBUG(D_NET, "udsp examining lpn %s\n",
+                                      libcfs_net2str(lpn->lpn_net_id));
+
+                               if (LNET_NETTYP(lpn->lpn_net_id) !=
+                                   descr->ud_net_id.udn_net_type)
+                                       continue;
+
+                               udi->udi_lpn = lpn;
+
+                               /* zero: handled at the peer net level */
+                               if (lnet_udsp_apply_rule_on_lpn(udi) == 0)
+                                       continue;
+
+                               list_for_each_entry(lpni, &lpn->lpn_peer_nis,
+                                                   lpni_peer_nis) {
+                                       CDEBUG(D_NET, "udsp examining lpni %s\n",
+                                              libcfs_nid2str(lpni->lpni_nid));
+                                       udi->udi_lpni = lpni;
+                                       rc = lnet_udsp_apply_rule_on_lpni(udi);
+                                       if (rc != 0)
+                                               last_failure = rc;
+                               }
+                       }
+               }
+       }
+
+       return last_failure;
+}
+
+/* Classify one policy by which of its descriptors are present and hand it
+ * to the corresponding callback in @cbs:
+ *
+ *   dst + src  NID pair rule:      preferred list, UDSP_APPLY_ON_PEERS
+ *   dst + rte  router rule:        preferred list, UDSP_APPLY_ON_PEERS
+ *   dst only   destination prio:   priority,       UDSP_APPLY_ON_PEERS
+ *   src only   source prio:        priority,       UDSP_APPLY_PRIO_ON_NIS
+ *
+ * @udi is filled in with the match/action descriptors, the action type,
+ * the priority and the local flag before the callback is invoked. A
+ * missing callback, an unexpected action type or an unclassifiable policy
+ * all yield 0.
+ *
+ * Returns the non-zero result of the UDSP_APPLY_ON_PEERS callback, or 0.
+ * The result of the UDSP_APPLY_PRIO_ON_NIS callback is not propagated.
+ */
+static int
+lnet_udsp_apply_single_policy(struct lnet_udsp *udsp, struct udsp_info *udi,
+                             udsp_apply_rule *cbs)
+{
+       int rc;
+
+       if (lnet_udsp_criteria_present(&udsp->udsp_dst) &&
+           lnet_udsp_criteria_present(&udsp->udsp_src)) {
+               /* NID Pair rule */
+               if (!cbs[UDSP_APPLY_ON_PEERS])
+                       return 0;
+
+               if (udsp->udsp_action_type !=
+                       EN_LNET_UDSP_ACTION_PREFERRED_LIST) {
+                       CERROR("Bad action type. Expected %d got %d\n",
+                               EN_LNET_UDSP_ACTION_PREFERRED_LIST,
+                               udsp->udsp_action_type);
+                       return 0;
+               }
+               udi->udi_match = &udsp->udsp_dst;
+               udi->udi_action = &udsp->udsp_src;
+               udi->udi_type = EN_LNET_UDSP_ACTION_PREFERRED_LIST;
+               udi->udi_local = true;
+
+               CDEBUG(D_NET, "applying udsp (%p) dst->src\n",
+                       udsp);
+               rc = cbs[UDSP_APPLY_ON_PEERS](udi);
+               if (rc)
+                       return rc;
+       } else if (lnet_udsp_criteria_present(&udsp->udsp_dst) &&
+                  lnet_udsp_criteria_present(&udsp->udsp_rte)) {
+               /* Router rule. src cannot be present here: dst + src is
+                * taken by the NID pair branch above.
+                */
+               if (!cbs[UDSP_APPLY_ON_PEERS])
+                       return 0;
+
+               if (udsp->udsp_action_type !=
+                       EN_LNET_UDSP_ACTION_PREFERRED_LIST) {
+                       CERROR("Bad action type. Expected %d got %d\n",
+                               EN_LNET_UDSP_ACTION_PREFERRED_LIST,
+                               udsp->udsp_action_type);
+                       return 0;
+               }
+               udi->udi_match = &udsp->udsp_dst;
+               udi->udi_action = &udsp->udsp_rte;
+               udi->udi_type = EN_LNET_UDSP_ACTION_PREFERRED_LIST;
+               udi->udi_local = false;
+
+               CDEBUG(D_NET, "applying udsp (%p) dst->rte\n",
+                       udsp);
+               rc = cbs[UDSP_APPLY_ON_PEERS](udi);
+               if (rc)
+                       return rc;
+       } else if (lnet_udsp_criteria_present(&udsp->udsp_dst)) {
+               /* destination priority rule */
+               if (!cbs[UDSP_APPLY_ON_PEERS])
+                       return 0;
+
+               if (udsp->udsp_action_type !=
+                       EN_LNET_UDSP_ACTION_PRIORITY) {
+                       CERROR("Bad action type. Expected %d got %d\n",
+                               EN_LNET_UDSP_ACTION_PRIORITY,
+                               udsp->udsp_action_type);
+                       return 0;
+               }
+               /* the action type is known to be PRIORITY from here on */
+               udi->udi_match = &udsp->udsp_dst;
+               udi->udi_type = EN_LNET_UDSP_ACTION_PRIORITY;
+               udi->udi_priority = udsp->udsp_action.udsp_priority;
+               udi->udi_local = true;
+
+               CDEBUG(D_NET, "applying udsp (%p) on destination\n",
+                       udsp);
+               rc = cbs[UDSP_APPLY_ON_PEERS](udi);
+               if (rc)
+                       return rc;
+       } else if (lnet_udsp_criteria_present(&udsp->udsp_src)) {
+               /* source priority rule */
+               if (!cbs[UDSP_APPLY_PRIO_ON_NIS])
+                       return 0;
+
+               if (udsp->udsp_action_type !=
+                       EN_LNET_UDSP_ACTION_PRIORITY) {
+                       CERROR("Bad action type. Expected %d got %d\n",
+                               EN_LNET_UDSP_ACTION_PRIORITY,
+                               udsp->udsp_action_type);
+                       return 0;
+               }
+               /* the action type is known to be PRIORITY from here on */
+               udi->udi_match = &udsp->udsp_src;
+               udi->udi_type = EN_LNET_UDSP_ACTION_PRIORITY;
+               udi->udi_priority = udsp->udsp_action.udsp_priority;
+               udi->udi_local = true;
+
+               CDEBUG(D_NET, "applying udsp (%p) on source\n",
+                       udsp);
+               /* the callback's result is deliberately not returned; the
+                * net level callback may report RULE_NOT_APPLICABLE, which
+                * is not an error
+                */
+               cbs[UDSP_APPLY_PRIO_ON_NIS](udi);
+       } else {
+               CERROR("Bad UDSP policy\n");
+               return 0;
+       }
+
+       return 0;
+}
+
+/* Apply @udsp if one is given; otherwise apply every policy on
+ * the_lnet.ln_udsp_list, walking the list in reverse. In the list case
+ * all policies are processed and the last non-zero result is returned.
+ */
+static int
+lnet_udsp_apply_policies_helper(struct lnet_udsp *udsp, struct udsp_info *udi,
+                               udsp_apply_rule *cbs)
+{
+       struct lnet_udsp *cur;
+       int last_failure = 0;
+       int rc;
+
+       if (udsp != NULL)
+               return lnet_udsp_apply_single_policy(udsp, udi, cbs);
+
+       list_for_each_entry_reverse(cur, &the_lnet.ln_udsp_list,
+                                   udsp_on_list) {
+               rc = lnet_udsp_apply_single_policy(cur, udi, cbs);
+               if (rc != 0)
+                       last_failure = rc;
+       }
+
+       return last_failure;
+}
+
+/* Apply all configured policies to the single local NI @ni. Only the
+ * UDSP_APPLY_PRIO_ON_NIS callback is installed.
+ */
+int
+lnet_udsp_apply_policies_on_ni(struct lnet_ni *ni)
+{
+       struct udsp_info udi = {
+               .udi_ni = ni,
+       };
+       udsp_apply_rule cbs[UDSP_APPLY_MAX_ENUM] = {
+               [UDSP_APPLY_PRIO_ON_NIS] = lnet_udsp_apply_rule_on_ni,
+       };
+
+       return lnet_udsp_apply_policies_helper(NULL, &udi, cbs);
+}
+
+/* Apply all configured policies to the single local net @net. The
+ * UDSP_APPLY_PRIO_ON_NIS and UDSP_APPLY_RTE_ON_NETS callbacks are
+ * installed with their net level implementations.
+ */
+int
+lnet_udsp_apply_policies_on_net(struct lnet_net *net)
+{
+       struct udsp_info udi = {
+               .udi_net = net,
+       };
+       udsp_apply_rule cbs[UDSP_APPLY_MAX_ENUM] = {
+               [UDSP_APPLY_PRIO_ON_NIS] = lnet_udsp_apply_prio_rule_on_net,
+               [UDSP_APPLY_RTE_ON_NETS] = lnet_udsp_apply_rte_rule_on_net,
+       };
+
+       return lnet_udsp_apply_policies_helper(NULL, &udi, cbs);
+}
+
+/* Apply all configured policies to the single peer NI @lpni. Only the
+ * UDSP_APPLY_ON_PEERS callback is installed.
+ */
+int
+lnet_udsp_apply_policies_on_lpni(struct lnet_peer_ni *lpni)
+{
+       struct udsp_info udi = {
+               .udi_lpni = lpni,
+       };
+       udsp_apply_rule cbs[UDSP_APPLY_MAX_ENUM] = {
+               [UDSP_APPLY_ON_PEERS] = lnet_udsp_apply_rule_on_lpni,
+       };
+
+       return lnet_udsp_apply_policies_helper(NULL, &udi, cbs);
+}
+
+/* Apply all configured policies to the single peer net @lpn. Only the
+ * UDSP_APPLY_ON_PEERS callback is installed.
+ */
+int
+lnet_udsp_apply_policies_on_lpn(struct lnet_peer_net *lpn)
+{
+       struct udsp_info udi = {
+               .udi_lpn = lpn,
+       };
+       udsp_apply_rule cbs[UDSP_APPLY_MAX_ENUM] = {
+               [UDSP_APPLY_ON_PEERS] = lnet_udsp_apply_rule_on_lpn,
+       };
+
+       return lnet_udsp_apply_policies_helper(NULL, &udi, cbs);
+}
+
+/* Apply @udsp, or every configured policy when @udsp is NULL, to all
+ * local nets/NIs and all peers. With @revert set the policy effects are
+ * undone instead. The work is done with LNET_LOCK_EX taken here.
+ *
+ * Returns the result of lnet_udsp_apply_policies_helper().
+ */
+int
+lnet_udsp_apply_policies(struct lnet_udsp *udsp, bool revert)
+{
+       struct udsp_info udi = {
+               .udi_revert = revert,
+       };
+       udsp_apply_rule cbs[UDSP_APPLY_MAX_ENUM] = {
+               [UDSP_APPLY_ON_PEERS] = lnet_udsp_apply_rule_on_lpnis,
+               [UDSP_APPLY_PRIO_ON_NIS] = lnet_udsp_apply_rule_on_nis,
+               [UDSP_APPLY_RTE_ON_NETS] = lnet_udsp_apply_rte_rule_on_nets,
+       };
+       int rc;
+
+       lnet_net_lock(LNET_LOCK_EX);
+       rc = lnet_udsp_apply_policies_helper(udsp, &udi, cbs);
+       lnet_net_unlock(LNET_LOCK_EX);
+
+       return rc;
+}
+
+/* Return the policy at list position @idx (0 based) on
+ * the_lnet.ln_udsp_list, or NULL when @idx is negative or past the end of
+ * the list.
+ */
+struct lnet_udsp *
+lnet_udsp_get_policy(int idx)
+{
+       int i = 0;
+       struct lnet_udsp *udsp;
+       struct lnet_udsp *found = NULL;
+
+       CDEBUG(D_NET, "Get UDSP at idx = %d\n", idx);
+
+       if (idx < 0)
+               return NULL;
+
+       list_for_each_entry(udsp, &the_lnet.ln_udsp_list, udsp_on_list) {
+               CDEBUG(D_NET, "iterating over upsp %d:%d:%d\n",
+                      udsp->udsp_idx, i, idx);
+               if (i == idx) {
+                       found = udsp;
+                       break;
+               }
+               i++;
+       }
+
+       /* log the result, not the list cursor: once the loop has run to
+        * completion the cursor no longer refers to a real entry
+        */
+       CDEBUG(D_NET, "Found UDSP (%p)\n", found);
+
+       return found;
+}
+
+/* Insert @new into the_lnet.ln_udsp_list at list position @idx.
+ *
+ * If a policy with equal src/dst/rte descriptors already exists, @new is
+ * not inserted: when both are priority actions with different priorities
+ * the existing policy's priority is updated and 0 is returned, otherwise
+ * -EALREADY is returned.
+ *
+ * If @idx does not correspond to an existing list position, @new is
+ * appended at the tail.
+ *
+ * Returns 0 on insertion or priority update, -EALREADY on a duplicate.
+ */
+int
+lnet_udsp_add_policy(struct lnet_udsp *new, int idx)
+{
+       struct lnet_udsp *udsp;
+       struct lnet_udsp *insert = NULL;
+       int i = 0;
+
+       /* single pass: remember the entry currently at position idx and
+        * look for a policy with the same descriptors as the new one
+        */
+       list_for_each_entry(udsp, &the_lnet.ln_udsp_list, udsp_on_list) {
+               CDEBUG(D_NET, "found udsp i = %d:%d, idx = %d\n",
+                      i, udsp->udsp_idx, idx);
+               if (i == idx) {
+                       insert = udsp;
+                       new->udsp_idx = idx;
+               }
+               i++;
+               if (lnet_udsp_equal(udsp, new)) {
+                       if (!lnet_udsp_action_equal(udsp, new) &&
+                           udsp->udsp_action_type == EN_LNET_UDSP_ACTION_PRIORITY &&
+                           new->udsp_action_type == EN_LNET_UDSP_ACTION_PRIORITY) {
+                               udsp->udsp_action.udsp_priority = new->udsp_action.udsp_priority;
+                               CDEBUG(D_NET, "udsp: %p index %d updated priority to %d\n",
+                                      udsp,
+                                      udsp->udsp_idx,
+                                      udsp->udsp_action.udsp_priority);
+                               return 0;
+                       }
+                       return -EALREADY;
+               }
+       }
+
+       if (insert) {
+               /* list_add() after insert's predecessor places the new
+                * policy directly in front of insert
+                */
+               list_add(&new->udsp_on_list, insert->udsp_on_list.prev);
+               i = 0;
+               /* skip the first idx + 1 entries (up to and including the
+                * new one) and bump the index of every entry behind it
+                */
+               list_for_each_entry(udsp,
+                                   &the_lnet.ln_udsp_list,
+                                   udsp_on_list) {
+                       if (i <= idx) {
+                               i++;
+                               continue;
+                       }
+                       udsp->udsp_idx++;
+               }
+       } else {
+               /* no entry at idx: append; i is the number of entries
+                * counted by the loop above
+                */
+               list_add_tail(&new->udsp_on_list, &the_lnet.ln_udsp_list);
+               new->udsp_idx = i;
+       }
+
+       CDEBUG(D_NET, "udsp: %p added at index %d\n", new, new->udsp_idx);
+
+       CDEBUG(D_NET, "udsp list:\n");
+       list_for_each_entry(udsp, &the_lnet.ln_udsp_list, udsp_on_list)
+               CDEBUG(D_NET, "udsp %p:%d\n", udsp, udsp->udsp_idx);
+
+       return 0;
+}
+
+/* Delete the policy whose udsp_idx equals @idx, or all policies via
+ * lnet_udsp_destroy() when @idx is negative.
+ *
+ * The removed policy is unlinked, its effects are reverted with
+ * lnet_udsp_apply_policies(udsp, true) and it is freed; every policy
+ * behind it on the list has its index decremented. Always returns 0,
+ * also when no policy has the requested index.
+ */
+int
+lnet_udsp_del_policy(int idx)
+{
+       struct lnet_udsp *udsp;
+       struct lnet_udsp *next;
+       bool removed = false;
+
+       if (idx < 0) {
+               lnet_udsp_destroy(false);
+               return 0;
+       }
+
+       CDEBUG(D_NET, "del udsp at idx = %d\n", idx);
+
+       list_for_each_entry_safe(udsp, next, &the_lnet.ln_udsp_list,
+                                udsp_on_list) {
+               /* past the deleted entry: only renumber */
+               if (removed) {
+                       udsp->udsp_idx--;
+                       continue;
+               }
+
+               if (udsp->udsp_idx != idx)
+                       continue;
+
+               list_del_init(&udsp->udsp_on_list);
+               lnet_udsp_apply_policies(udsp, true);
+               lnet_udsp_free(udsp);
+               removed = true;
+       }
+
+       return 0;
+}
+
+/* Fill @info from the local NI @ni: the NI selection priority and, when
+ * the NI is attached to a net, the net selection priority and up to
+ * LNET_MAX_SHOW_NUM_NID preferred router NIDs of that net.
+ */
+static void
+lnet_udsp_get_ni_info(struct lnet_ioctl_construct_udsp_info *info,
+                     struct lnet_ni *ni)
+{
+       struct lnet_nid_list *ne;
+       struct lnet_net *net;
+       int i = 0;
+
+       /* assert before the first dereference of ni */
+       LASSERT(ni);
+       net = ni->ni_net;
+
+       info->cud_nid_priority = ni->ni_sel_priority;
+       if (!net)
+               return;
+
+       info->cud_net_priority = net->net_sel_priority;
+       list_for_each_entry(ne, &net->net_rtr_pref_nids, nl_list) {
+               /* the output array holds LNET_MAX_SHOW_NUM_NID entries */
+               if (i >= LNET_MAX_SHOW_NUM_NID)
+                       break;
+               info->cud_pref_rtr_nid[i] = ne->nl_nid;
+               i++;
+       }
+}
+
+/* Fill @info from the peer NI @lpni: its selection priority, up to
+ * LNET_MAX_SHOW_NUM_NID preferred NIDs, up to LNET_MAX_SHOW_NUM_NID
+ * preferred router NIDs and the selection priority of its peer net.
+ */
+static void
+lnet_udsp_get_peer_info(struct lnet_ioctl_construct_udsp_info *info,
+                       struct lnet_peer_ni *lpni)
+{
+       struct lnet_nid_list *entry;
+       int n;
+
+       /* peer tree structure needs to be in existence */
+       LASSERT(lpni && lpni->lpni_peer_net &&
+               lpni->lpni_peer_net->lpn_peer);
+
+       info->cud_nid_priority = lpni->lpni_sel_priority;
+       CDEBUG(D_NET, "lpni %s has %d pref nids\n",
+              libcfs_nid2str(lpni->lpni_nid),
+              lpni->lpni_pref_nnids);
+
+       /* a single preferred NID is read from lpni_pref.nid, several
+        * from the lpni_pref.nids list
+        */
+       if (lpni->lpni_pref_nnids == 1) {
+               info->cud_pref_nid[0] = lpni->lpni_pref.nid;
+       } else if (lpni->lpni_pref_nnids > 1) {
+               n = 0;
+               list_for_each_entry(entry, &lpni->lpni_pref.nids, nl_list) {
+                       if (n >= LNET_MAX_SHOW_NUM_NID)
+                               break;
+                       info->cud_pref_nid[n] = entry->nl_nid;
+                       n++;
+               }
+       }
+
+       n = 0;
+       list_for_each_entry(entry, &lpni->lpni_rtr_pref_nids, nl_list) {
+               if (n >= LNET_MAX_SHOW_NUM_NID)
+                       break;
+               info->cud_pref_rtr_nid[n] = entry->nl_nid;
+               n++;
+       }
+
+       info->cud_net_priority = lpni->lpni_peer_net->lpn_sel_priority;
+}
+
+/* Look up info->cud_nid and fill @info with the selection state of the
+ * matching object: a peer NI when info->cud_peer is set, a local NI
+ * otherwise. @info is left untouched when the NID is not found. The
+ * lookup runs under lnet_net_lock(0).
+ */
+void
+lnet_udsp_get_construct_info(struct lnet_ioctl_construct_udsp_info *info)
+{
+       struct lnet_peer_ni *lpni;
+       struct lnet_ni *ni;
+
+       lnet_net_lock(0);
+       if (info->cud_peer) {
+               lpni = lnet_find_peer_ni_locked(info->cud_nid);
+               if (lpni) {
+                       lnet_udsp_get_peer_info(info, lpni);
+                       /* presumably balances a reference taken by the
+                        * lookup above - confirm
+                        */
+                       lnet_peer_ni_decref_locked(lpni);
+               } else {
+                       CDEBUG(D_NET, "nid %s is not found\n",
+                              libcfs_nid2str(info->cud_nid));
+               }
+       } else {
+               ni = lnet_nid2ni_locked(info->cud_nid, 0);
+               if (ni)
+                       lnet_udsp_get_ni_info(info, ni);
+       }
+       lnet_net_unlock(0);
+}
+
+/* Allocate a zeroed UDSP from lnet_udsp_cachep and initialise all of
+ * its list heads.
+ *
+ * Returns the new UDSP, or NULL if the allocation failed.
+ */
+struct lnet_udsp *
+lnet_udsp_alloc(void)
+{
+       struct lnet_udsp *new_udsp =
+               kmem_cache_alloc(lnet_udsp_cachep, GFP_NOFS | __GFP_ZERO);
+
+       if (new_udsp) {
+               INIT_LIST_HEAD(&new_udsp->udsp_on_list);
+
+               /* source NID descriptor */
+               INIT_LIST_HEAD(&new_udsp->udsp_src.ud_addr_range);
+               INIT_LIST_HEAD(&new_udsp->udsp_src.ud_net_id.udn_net_num_range);
+
+               /* destination NID descriptor */
+               INIT_LIST_HEAD(&new_udsp->udsp_dst.ud_addr_range);
+               INIT_LIST_HEAD(&new_udsp->udsp_dst.ud_net_id.udn_net_num_range);
+
+               /* router NID descriptor */
+               INIT_LIST_HEAD(&new_udsp->udsp_rte.ud_addr_range);
+               INIT_LIST_HEAD(&new_udsp->udsp_rte.ud_net_id.udn_net_num_range);
+
+               CDEBUG(D_MALLOC, "udsp alloc %p\n", new_udsp);
+       }
+
+       return new_udsp;
+}
+
+/* Release the buffer backing the expressions of one NID descriptor.
+ *
+ * All expressions of a descriptor live in a single buffer of
+ * ud_mem_size bytes, so it is released with one call. The buffer
+ * starts at the first entry of the net number range list when that
+ * list is populated, otherwise at the first entry of the address
+ * range list.
+ */
+static void
+lnet_udsp_nid_descr_free(struct lnet_ud_nid_descr *nid_descr)
+{
+       struct list_head *net_range = &nid_descr->ud_net_id.udn_net_num_range;
+
+       /* nothing was stored if the descriptor carries no criteria */
+       if (!lnet_udsp_criteria_present(nid_descr))
+               return;
+
+       if (!list_empty(net_range)) {
+               LIBCFS_FREE(net_range->next, nid_descr->ud_mem_size);
+               return;
+       }
+
+       if (list_empty(&nid_descr->ud_addr_range))
+               return;
+
+       LIBCFS_FREE(nid_descr->ud_addr_range.next,
+                   nid_descr->ud_mem_size);
+}
+
+/* Free a UDSP: first the buffers behind its three NID descriptors,
+ * then the UDSP itself, which goes back to lnet_udsp_cachep.
+ */
+void
+lnet_udsp_free(struct lnet_udsp *udsp)
+{
+       struct lnet_ud_nid_descr *src = &udsp->udsp_src;
+       struct lnet_ud_nid_descr *dst = &udsp->udsp_dst;
+       struct lnet_ud_nid_descr *rte = &udsp->udsp_rte;
+
+       lnet_udsp_nid_descr_free(src);
+       lnet_udsp_nid_descr_free(dst);
+       lnet_udsp_nid_descr_free(rte);
+
+       CDEBUG(D_MALLOC, "udsp free %p\n", udsp);
+       kmem_cache_free(lnet_udsp_cachep, udsp);
+}
+
+/* Remove and free every UDSP on the_lnet.ln_udsp_list.
+ *
+ * Unless @shutdown is set, lnet_udsp_apply_policies() is called with
+ * 'true' on each policy before it is freed (presumably to revert its
+ * effect - confirm against lnet_udsp_apply_policies()).
+ */
+void
+lnet_udsp_destroy(bool shutdown)
+{
+       struct lnet_udsp *cur;
+       struct lnet_udsp *next;
+
+       CDEBUG(D_NET, "Destroying UDSPs in the system\n");
+
+       list_for_each_entry_safe(cur, next, &the_lnet.ln_udsp_list,
+                                udsp_on_list) {
+               list_del(&cur->udsp_on_list);
+
+               if (!shutdown)
+                       lnet_udsp_apply_policies(cur, true);
+
+               lnet_udsp_free(cur);
+       }
+}
+
+/* Compute the number of bytes one NID descriptor takes once marshaled.
+ *
+ * The fixed struct lnet_ioctl_udsp_descr is always counted. When the
+ * descriptor carries criteria, one struct lnet_expressions is added
+ * per address expression and one struct lnet_range_expr per range of
+ * the net number expression and of every address expression.
+ */
+static size_t
+lnet_size_marshaled_nid_descr(struct lnet_ud_nid_descr *descr)
+{
+       struct list_head *net_list = &descr->ud_net_id.udn_net_num_range;
+       struct cfs_expr_list *expr;
+       size_t total = sizeof(struct lnet_ioctl_udsp_descr);
+       int nexprs = 0;
+       int nranges = 0;
+
+       if (!lnet_udsp_criteria_present(descr))
+               return total;
+
+       /* only the first net expression is considered */
+       if (!list_empty(net_list)) {
+               expr = list_first_entry(net_list, struct cfs_expr_list,
+                                       el_link);
+               nranges = lnet_get_list_len(&expr->el_exprs);
+       }
+
+       /* add the ranges of each address expression */
+       list_for_each_entry(expr, &descr->ud_addr_range, el_link) {
+               nranges += lnet_get_list_len(&expr->el_exprs);
+               nexprs++;
+       }
+
+       total += sizeof(struct lnet_expressions) * nexprs;
+       total += sizeof(struct lnet_range_expr) * nranges;
+
+       return total;
+}
+
+/* Return the number of bytes needed to marshal @udsp: the fixed
+ * struct lnet_ioctl_udsp plus its three marshaled NID descriptors.
+ */
+size_t
+lnet_get_udsp_size(struct lnet_udsp *udsp)
+{
+       size_t size;
+
+       size = sizeof(struct lnet_ioctl_udsp) +
+              lnet_size_marshaled_nid_descr(&udsp->udsp_src) +
+              lnet_size_marshaled_nid_descr(&udsp->udsp_dst) +
+              lnet_size_marshaled_nid_descr(&udsp->udsp_rte);
+
+       CDEBUG(D_NET, "get udsp (%p) size: %d\n", udsp, (int)size);
+
+       return size;
+}
+
+/* Copy every range of @expr to the user buffer as a
+ * struct lnet_range_expr.
+ *
+ * @bulk is advanced and @bulk_size reduced by the size of each range
+ * written. Returns 0 on success or -EFAULT if a copy to user space
+ * fails.
+ */
+static int
+copy_exprs(struct cfs_expr_list *expr, void __user **bulk,
+          __u32 *bulk_size)
+{
+       struct cfs_range_expr *cur;
+       struct lnet_range_expr out;
+
+       list_for_each_entry(cur, &expr->el_exprs, re_link) {
+               out.re_lo = cur->re_lo;
+               out.re_hi = cur->re_hi;
+               out.re_stride = cur->re_stride;
+               CDEBUG(D_NET, "Copy Range %u:%u:%u\n",
+                      out.re_lo, out.re_hi, out.re_stride);
+
+               if (copy_to_user(*bulk, &out, sizeof(out)) != 0) {
+                       CDEBUG(D_NET, "Failed to copy range_expr\n");
+                       return -EFAULT;
+               }
+
+               /* move the cursor past the range just written */
+               *bulk += sizeof(out);
+               *bulk_size -= sizeof(out);
+       }
+
+       return 0;
+}
+
+/* Marshal one NID descriptor into the user bulk buffer.
+ *
+ * Layout written at *bulk:
+ *   struct lnet_ioctl_udsp_descr        (always)
+ *   struct lnet_range_expr[]            (net number ranges, if any)
+ *   for each address expression:
+ *     struct lnet_expressions
+ *     struct lnet_range_expr[]
+ *
+ * @nid_descr   descriptor to marshal
+ * @type        4-byte tag identifying the descriptor ("SRC", "DST" or
+ *              "RTE" including the terminating NUL)
+ * @bulk        [IN/OUT] cursor into the user buffer
+ * @bulk_size   [IN/OUT] bytes remaining in the user buffer
+ *
+ * Returns 0 on success or -EFAULT if a copy to user space fails.
+ */
+static int
+copy_nid_range(struct lnet_ud_nid_descr *nid_descr, char *type,
+               void __user **bulk, __u32 *bulk_size)
+{
+       struct lnet_ioctl_udsp_descr ioc_udsp_descr;
+       struct cfs_expr_list *expr;
+       struct lnet_expressions ioc_expr;
+       __u32 tag;
+       int expr_count;
+       int net_expr_count;
+       int rc;
+
+       memset(&ioc_udsp_descr, 0, sizeof(ioc_udsp_descr));
+       /* @type points at a string literal which has no alignment
+        * guarantee, so copy the tag bytes instead of dereferencing it
+        * as a __u32
+        */
+       memcpy(&tag, type, sizeof(tag));
+       ioc_udsp_descr.iud_src_hdr.ud_descr_type = tag;
+
+       /* if criteria not present, copy over the static part of the NID
+        * descriptor
+        */
+       if (!lnet_udsp_criteria_present(nid_descr)) {
+               CDEBUG(D_NET, "Descriptor %u:%u:%u:%u\n",
+                      ioc_udsp_descr.iud_src_hdr.ud_descr_type,
+                      ioc_udsp_descr.iud_src_hdr.ud_descr_count,
+                      ioc_udsp_descr.iud_net.ud_net_type,
+                      ioc_udsp_descr.iud_net.ud_net_num_expr.le_count);
+               if (copy_to_user(*bulk, &ioc_udsp_descr,
+                                sizeof(ioc_udsp_descr))) {
+                       CDEBUG(D_NET, "failed to copy ioc_udsp_descr\n");
+                       return -EFAULT;
+               }
+               *bulk += sizeof(ioc_udsp_descr);
+               *bulk_size -= sizeof(ioc_udsp_descr);
+               return 0;
+       }
+
+       expr_count = lnet_get_list_len(&nid_descr->ud_addr_range);
+
+       /* copy the net information */
+       if (!list_empty(&nid_descr->ud_net_id.udn_net_num_range)) {
+               expr = list_first_entry(&nid_descr->ud_net_id.udn_net_num_range,
+                                       struct cfs_expr_list, el_link);
+               net_expr_count = lnet_get_list_len(&expr->el_exprs);
+       } else {
+               net_expr_count = 0;
+       }
+
+       /* set the total expression count */
+       ioc_udsp_descr.iud_src_hdr.ud_descr_count = expr_count;
+       ioc_udsp_descr.iud_net.ud_net_type =
+               nid_descr->ud_net_id.udn_net_type;
+       ioc_udsp_descr.iud_net.ud_net_num_expr.le_count = net_expr_count;
+
+       CDEBUG(D_NET, "Descriptor %u:%u:%u:%u\n",
+               ioc_udsp_descr.iud_src_hdr.ud_descr_type,
+               ioc_udsp_descr.iud_src_hdr.ud_descr_count,
+               ioc_udsp_descr.iud_net.ud_net_type,
+               ioc_udsp_descr.iud_net.ud_net_num_expr.le_count);
+
+       /* copy over the header info to the bulk */
+       if (copy_to_user(*bulk, &ioc_udsp_descr, sizeof(ioc_udsp_descr))) {
+               CDEBUG(D_NET, "Failed to copy data\n");
+               return -EFAULT;
+       }
+       *bulk += sizeof(ioc_udsp_descr);
+       *bulk_size -= sizeof(ioc_udsp_descr);
+
+       /* copy over the net num expression if it exists; expr is only
+        * valid here when net_expr_count is non-zero
+        */
+       if (net_expr_count) {
+               rc = copy_exprs(expr, bulk, bulk_size);
+               if (rc)
+                       return rc;
+       }
+
+       /* copy the address range: a count header followed by the ranges
+        * of each expression
+        */
+       list_for_each_entry(expr, &nid_descr->ud_addr_range, el_link) {
+               ioc_expr.le_count = lnet_get_list_len(&expr->el_exprs);
+               if (copy_to_user(*bulk, &ioc_expr, sizeof(ioc_expr))) {
+                       CDEBUG(D_NET, "failed to copy ioc_expr\n");
+                       return -EFAULT;
+               }
+               *bulk += sizeof(ioc_expr);
+               *bulk_size -= sizeof(ioc_expr);
+
+               rc = copy_exprs(expr, bulk, bulk_size);
+               if (rc)
+                       return rc;
+       }
+
+       return 0;
+}
+
+/* Marshal @udsp into the ioctl structure @ioc_udsp and its user bulk
+ * buffer.
+ *
+ * The buffer size given by user space (ioc_len + iou_bulk_size) must
+ * match lnet_get_udsp_size(udsp) exactly.
+ *
+ * Returns 0 on success, -EINVAL if @ioc_udsp is NULL, -ENOSPC on a
+ * size mismatch, or the error returned by copy_nid_range().
+ */
+int
+lnet_udsp_marshal(struct lnet_udsp *udsp, struct lnet_ioctl_udsp *ioc_udsp)
+{
+       void __user *bulk;
+       __u32 bulk_size;
+       int rc;
+
+       if (!ioc_udsp)
+               return -EINVAL;
+
+       bulk = ioc_udsp->iou_bulk;
+       bulk_size = ioc_udsp->iou_hdr.ioc_len + ioc_udsp->iou_bulk_size;
+
+       CDEBUG(D_NET, "marshal udsp (%p)\n", udsp);
+       CDEBUG(D_NET, "MEM -----> bulk: %p:0x%x\n", bulk, bulk_size);
+
+       /* user space must have sized its buffer for exactly this udsp */
+       if (bulk_size != lnet_get_udsp_size(udsp)) {
+               rc = -ENOSPC;
+               goto fail;
+       }
+
+       /* fixed part of the policy */
+       ioc_udsp->iou_idx = udsp->udsp_idx;
+       ioc_udsp->iou_action_type = udsp->udsp_action_type;
+       ioc_udsp->iou_action.priority = udsp->udsp_action.udsp_priority;
+
+       bulk_size -= sizeof(*ioc_udsp);
+
+       /* the three descriptors follow in src, dst, rte order; stop at
+        * the first failure
+        */
+       rc = copy_nid_range(&udsp->udsp_src, "SRC", &bulk, &bulk_size);
+       if (!rc)
+               rc = copy_nid_range(&udsp->udsp_dst, "DST", &bulk, &bulk_size);
+       if (!rc)
+               rc = copy_nid_range(&udsp->udsp_rte, "RTE", &bulk, &bulk_size);
+       if (rc)
+               goto fail;
+
+       CDEBUG(D_NET, "MEM <----- bulk: %p\n", bulk);
+
+       /* the size matched exactly, so nothing may be left over */
+       LASSERT(bulk_size == 0);
+       return 0;
+
+fail:
+       CERROR("Failed to marshal udsp: %d\n", rc);
+       return rc;
+}
+
+/* Build one cfs_expr_list with its ranges from the marshaled data at
+ * *bulk, carving the objects out of the preallocated buffer at *buf,
+ * and append it to @list.
+ *
+ * @count is the number of ranges to read. A @count of 0 does nothing.
+ * A @count of -1 means the ranges are preceded by a
+ * struct lnet_expressions holding the count. Both cursors are advanced
+ * past what was consumed. No bounds are checked here.
+ */
+static void
+copy_range_info(void **bulk, void **buf, struct list_head *list,
+               int count)
+{
+       struct cfs_expr_list *el;
+       int nranges = count;
+       int idx;
+
+       if (nranges == 0)
+               return;
+
+       if (nranges == -1) {
+               struct lnet_expressions *hdr = *bulk;
+
+               nranges = hdr->le_count;
+               *bulk += sizeof(*hdr);
+       }
+
+       /* the expression list object comes first in the buffer */
+       el = *buf;
+       INIT_LIST_HEAD(&el->el_link);
+       INIT_LIST_HEAD(&el->el_exprs);
+       list_add_tail(&el->el_link, list);
+       *buf += sizeof(*el);
+
+       /* followed by one cfs_range_expr per marshaled range */
+       for (idx = 0; idx < nranges; idx++) {
+               struct lnet_range_expr *from = *bulk;
+               struct cfs_range_expr *to = *buf;
+
+               INIT_LIST_HEAD(&to->re_link);
+               to->re_lo = from->re_lo;
+               to->re_hi = from->re_hi;
+               to->re_stride = from->re_stride;
+               CDEBUG(D_NET, "Copy Range %u:%u:%u\n",
+                      to->re_lo,
+                      to->re_hi,
+                      to->re_stride);
+               list_add_tail(&to->re_link, &el->el_exprs);
+
+               *bulk += sizeof(*from);
+               *buf += sizeof(*to);
+       }
+}
+
+/* Demarshal one NID descriptor from the bulk buffer into @nid_descr.
+ *
+ * The buffer is dereferenced directly. Its contents come from user
+ * space, so every count is validated against the bytes remaining
+ * BEFORE the data it describes is read, and all size arithmetic is done
+ * in 64 bits so that large counts cannot wrap around the check.
+ *
+ * @nid_descr   descriptor to fill in
+ * @type        4-byte tag expected in the descriptor header ("SRC",
+ *              "DST" or "RTE" including the terminating NUL)
+ * @bulk        [IN/OUT] cursor into the buffer
+ * @bulk_size   [IN/OUT] bytes remaining in the buffer
+ *
+ * Returns 0 on success, -EINVAL for a truncated, oversized or
+ * mistyped descriptor, -ENOMEM if the backing buffer cannot be
+ * allocated.
+ */
+static int
+copy_ioc_udsp_descr(struct lnet_ud_nid_descr *nid_descr, char *type,
+                   void **bulk, __u32 *bulk_size)
+{
+       struct lnet_ioctl_udsp_descr *ioc_nid = *bulk;
+       struct lnet_expressions *exprs;
+       const __u8 *given;
+       __u32 descr_type;
+       __u32 expected_type;
+       __u32 net_range_count;
+       __u32 expr_count;
+       __u32 alloc_size;
+       __u32 i;
+       __u64 range_count;
+       __u64 size;
+       __u64 need;
+       __u64 remaining_size = *bulk_size;
+       void *tmp = *bulk;
+       void *buf;
+       size_t range_expr_s = sizeof(struct lnet_range_expr);
+       size_t lnet_exprs_s = sizeof(struct lnet_expressions);
+
+       CDEBUG(D_NET, "%s: bulk = %p:%u\n", type, *bulk, *bulk_size);
+
+       /* the static part must fit before any of its fields is read */
+       if (remaining_size < sizeof(*ioc_nid)) {
+               CERROR("Truncated userspace udsp buffer given\n");
+               return -EINVAL;
+       }
+
+       /* criteria not present, skip over the static part of the
+        * bulk, which is included for each NID descriptor
+        */
+       if (ioc_nid->iud_net.ud_net_type == 0) {
+               *bulk += sizeof(*ioc_nid);
+               *bulk_size = remaining_size - sizeof(*ioc_nid);
+               return 0;
+       }
+
+       /* @type points at a string literal with no alignment guarantee,
+        * so copy the tag bytes rather than dereferencing it as a __u32
+        */
+       memcpy(&expected_type, type, sizeof(expected_type));
+       descr_type = ioc_nid->iud_src_hdr.ud_descr_type;
+       if (descr_type != expected_type) {
+               /* print the tag in memory order, independent of
+                * endianness
+                */
+               given = (const __u8 *)&descr_type;
+               CERROR("Bad NID descriptor type. Expected %s, given %c%c%c\n",
+                       type, given[0], given[1], given[2]);
+               return -EINVAL;
+       }
+
+       /* calculate the total size to verify we have enough buffer.
+        * Start of by finding how many ranges there are for the net
+        * expression.
+        */
+       net_range_count = ioc_nid->iud_net.ud_net_num_expr.le_count;
+       size = sizeof(*ioc_nid) + (__u64)net_range_count * range_expr_s;
+       if (size > remaining_size) {
+               CERROR("Truncated userspace udsp buffer given\n");
+               return -EINVAL;
+       }
+       remaining_size -= size;
+       range_count = net_range_count;
+
+       CDEBUG(D_NET, "Total net num ranges in %s: %d:%u\n", type,
+              (int)net_range_count, (__u32)size);
+       /* the number of expressions for the NID. IE 4 for IP, 1 for GNI */
+       expr_count = ioc_nid->iud_src_hdr.ud_descr_count;
+       CDEBUG(D_NET, "addr as %d exprs\n", (int)expr_count);
+       /* point tmp to the beginning of the NID expressions */
+       tmp += size;
+       for (i = 0; i < expr_count; i++) {
+               /* the count header itself must be inside the buffer
+                * before it is read
+                */
+               if (remaining_size < lnet_exprs_s) {
+                       CERROR("Truncated userspace udsp buffer given\n");
+                       return -EINVAL;
+               }
+               /* get the number of ranges per expression */
+               exprs = tmp;
+               size = (__u64)exprs->le_count * range_expr_s + lnet_exprs_s;
+               if (size > remaining_size) {
+                       CERROR("Truncated userspace udsp buffer given\n");
+                       return -EINVAL;
+               }
+               remaining_size -= size;
+               range_count += exprs->le_count;
+               CDEBUG(D_NET, "expr %d:%d:%u:%d:%d\n", (int)i,
+                      (int)exprs->le_count, (__u32)size,
+                      (int)remaining_size, (int)range_count);
+               tmp += size;
+       }
+
+       *bulk_size = remaining_size;
+
+       /* copy over the net type */
+       nid_descr->ud_net_id.udn_net_type = ioc_nid->iud_net.ud_net_type;
+
+       CDEBUG(D_NET, "%u\n", nid_descr->ud_net_id.udn_net_type);
+
+       /* nothing to store: do not allocate a buffer that would never be
+        * linked into either list and therefore never be freed by
+        * lnet_udsp_nid_descr_free()
+        */
+       if (range_count == 0 && expr_count == 0) {
+               nid_descr->ud_mem_size = 0;
+               *bulk += sizeof(*ioc_nid);
+               return 0;
+       }
+
+       /* allocate the total memory required to copy this NID descriptor:
+        * one cfs_expr_list per address expression plus one for the net
+        * number expression, and one cfs_range_expr per range
+        */
+       need = (__u64)sizeof(struct cfs_expr_list) * ((__u64)expr_count + 1) +
+              (__u64)sizeof(struct cfs_range_expr) * range_count;
+       if (need > (__u32)~0U) {
+               CERROR("udsp NID descriptor is too large\n");
+               return -EINVAL;
+       }
+       alloc_size = need;
+       LIBCFS_ALLOC(buf, alloc_size);
+       if (!buf)
+               return -ENOMEM;
+
+       /* store the amount of memory allocated so we can free it later on */
+       nid_descr->ud_mem_size = alloc_size;
+
+       /* copy over the net number range */
+       *bulk += sizeof(*ioc_nid);
+       CDEBUG(D_NET, "bulk = %p\n", *bulk);
+       copy_range_info(bulk, &buf, &nid_descr->ud_net_id.udn_net_num_range,
+                       (int)net_range_count);
+       CDEBUG(D_NET, "bulk = %p\n", *bulk);
+
+       /* copy over the NID descriptor */
+       for (i = 0; i < expr_count; i++) {
+               copy_range_info(bulk, &buf, &nid_descr->ud_addr_range, -1);
+               CDEBUG(D_NET, "bulk = %p\n", *bulk);
+       }
+
+       return 0;
+}
+
+/* Build a UDSP from the marshaled data in @bulk and hand it to
+ * lnet_udsp_add_policy() together with the index found in the buffer.
+ *
+ * Returns -ENOSPC if @bulk_size cannot hold a struct lnet_ioctl_udsp,
+ * -ENOMEM if the UDSP cannot be allocated, the negative error of
+ * copy_ioc_udsp_descr() if a descriptor is rejected (the UDSP is freed
+ * in that case), otherwise the result of lnet_udsp_add_policy().
+ */
+int
+lnet_udsp_demarshal_add(void *bulk, __u32 bulk_size)
+{
+       struct lnet_ioctl_udsp *ioc_udsp = bulk;
+       struct lnet_udsp *udsp;
+       int idx;
+       int rc;
+
+       if (bulk_size < sizeof(*ioc_udsp))
+               return -ENOSPC;
+
+       udsp = lnet_udsp_alloc();
+       if (!udsp)
+               return -ENOMEM;
+
+       /* fixed part of the policy */
+       idx = ioc_udsp->iou_idx;
+       udsp->udsp_action_type = ioc_udsp->iou_action_type;
+       udsp->udsp_action.udsp_priority = ioc_udsp->iou_action.priority;
+
+       CDEBUG(D_NET, "demarshal descr %u:%u:%d:%u\n", udsp->udsp_action_type,
+              udsp->udsp_action.udsp_priority, idx, bulk_size);
+
+       bulk += sizeof(*ioc_udsp);
+       bulk_size -= sizeof(*ioc_udsp);
+
+       /* the descriptors follow in src, dst, rte order; stop at the
+        * first one that is rejected
+        */
+       rc = copy_ioc_udsp_descr(&udsp->udsp_src, "SRC", &bulk, &bulk_size);
+       if (rc >= 0)
+               rc = copy_ioc_udsp_descr(&udsp->udsp_dst, "DST", &bulk,
+                                        &bulk_size);
+       if (rc >= 0)
+               rc = copy_ioc_udsp_descr(&udsp->udsp_rte, "RTE", &bulk,
+                                        &bulk_size);
+       if (rc < 0) {
+               lnet_udsp_free(udsp);
+               return rc;
+       }
+
+       return lnet_udsp_add_policy(udsp, idx);
+}
index 642ef2f..4e339cf 100644 (file)
@@ -29,7 +29,8 @@
 lib_LTLIBRARIES = liblnetconfig.la
 
 liblnetconfig_la_SOURCES  = liblnetconfig.c liblnetconfig.h \
-                           liblnetconfig_lnd.c liblnd.h cyaml.c cyaml.h
+                           liblnetconfig_lnd.c liblnd.h cyaml.c cyaml.h \
+                           liblnetconfig_udsp.c
 liblnetconfig_la_CPPFLAGS = -D_LARGEFILE64_SOURCE=1 -D_FILE_OFFSET_BITS=64 \
                            -DLUSTRE_UTILS=1 -fPIC
 liblnetconfig_la_LDFLAGS = -L$(top_builddir)/libcfs/libcfs -lyaml -lm \
index a5a3eeb..a4e446a 100644 (file)
 #include <glob.h>
 #include <libcfs/util/param.h>
 
-#define CONFIG_CMD             "configure"
-#define UNCONFIG_CMD           "unconfigure"
-#define ADD_CMD                        "add"
-#define DEL_CMD                        "del"
-#define SHOW_CMD               "show"
-#define DBG_CMD                        "dbg"
-#define MANAGE_CMD             "manage"
-
-#define MAX_NUM_IPS            128
-
-#define modparam_path "/sys/module/lnet/parameters/"
-#define o2ib_modparam_path "/sys/module/ko2iblnd/parameters/"
-#define gni_nid_path "/proc/cray_xt/"
-
 #ifndef HAVE_USRSPC_RDMA_PS_TCP
 #define RDMA_PS_TCP 0x0106
 #endif
@@ -1893,6 +1879,114 @@ get_counts(struct lnet_ioctl_element_msg_stats *msg_stats, int idx)
        return NULL;
 }
 
+/* Add a "udsp info" object for a local NI under @net_node.
+ *
+ * The object carries the net and NID priorities and, when at least one
+ * is set, a "Preferred gateway NIDs" object listing the router NIDs as
+ * "NID-<n>" entries. The list ends at the first zero NID.
+ *
+ * Returns LUSTRE_CFG_RC_NO_ERR, or LUSTRE_CFG_RC_OUT_OF_MEM if a YAML
+ * node cannot be created.
+ */
+static int
+create_local_udsp_info(struct lnet_ioctl_construct_udsp_info *udsp_info,
+                      struct cYAML *net_node)
+{
+       struct cYAML *gw_nids = NULL;
+       struct cYAML *udsp_net;
+       char key[LNET_MAX_STR_LEN];
+       int idx;
+
+       udsp_net = cYAML_create_object(net_node, "udsp info");
+       if (!udsp_net)
+               return LUSTRE_CFG_RC_OUT_OF_MEM;
+
+       if (!cYAML_create_number(udsp_net, "net priority",
+                                (int)udsp_info->cud_net_priority) ||
+           !cYAML_create_number(udsp_net, "nid priority",
+                                (int)udsp_info->cud_nid_priority))
+               return LUSTRE_CFG_RC_OUT_OF_MEM;
+
+       for (idx = 0; idx < LNET_MAX_SHOW_NUM_NID; idx++) {
+               memset(key, 0, sizeof(key));
+               if (udsp_info->cud_pref_rtr_nid[idx] == 0)
+                       break;
+
+               /* the parent object is created lazily, on the first NID */
+               if (!gw_nids) {
+                       gw_nids = cYAML_create_object(udsp_net,
+                                       "Preferred gateway NIDs");
+                       if (!gw_nids)
+                               return LUSTRE_CFG_RC_OUT_OF_MEM;
+               }
+
+               snprintf(key, sizeof(key), "NID-%d", idx);
+               if (!cYAML_create_string(gw_nids, key,
+                       libcfs_nid2str(udsp_info->cud_pref_rtr_nid[idx])))
+                       return LUSTRE_CFG_RC_OUT_OF_MEM;
+       }
+
+       return LUSTRE_CFG_RC_NO_ERR;
+}
+
+/* Add a "udsp info" object for a peer NI under @nid_node.
+ *
+ * The object carries the net and NID priorities, followed by optional
+ * "Preferred gateway NIDs" and "Preferred source NIDs" objects. Each
+ * is only created when its first NID is non-zero and lists the NIDs as
+ * "NID-<n>" entries up to the first zero NID.
+ *
+ * Returns LUSTRE_CFG_RC_NO_ERR, or LUSTRE_CFG_RC_OUT_OF_MEM if a YAML
+ * node cannot be created.
+ */
+static int
+create_remote_udsp_info(struct lnet_ioctl_construct_udsp_info *udsp_info,
+                       struct cYAML *nid_node)
+{
+       struct cYAML *gw_nids = NULL;
+       struct cYAML *src_nids = NULL;
+       struct cYAML *udsp_nid;
+       char key[LNET_MAX_STR_LEN];
+       int idx;
+
+       udsp_nid = cYAML_create_object(nid_node, "udsp info");
+       if (!udsp_nid)
+               return LUSTRE_CFG_RC_OUT_OF_MEM;
+
+       if (!cYAML_create_number(udsp_nid, "net priority",
+                                (int)udsp_info->cud_net_priority) ||
+           !cYAML_create_number(udsp_nid, "nid priority",
+                                (int)udsp_info->cud_nid_priority))
+               return LUSTRE_CFG_RC_OUT_OF_MEM;
+
+       /* preferred router NIDs */
+       for (idx = 0; idx < LNET_MAX_SHOW_NUM_NID; idx++) {
+               memset(key, 0, sizeof(key));
+               if (udsp_info->cud_pref_rtr_nid[idx] == 0)
+                       break;
+
+               if (!gw_nids) {
+                       gw_nids = cYAML_create_object(udsp_nid,
+                                       "Preferred gateway NIDs");
+                       if (!gw_nids)
+                               return LUSTRE_CFG_RC_OUT_OF_MEM;
+               }
+
+               snprintf(key, sizeof(key), "NID-%d", idx);
+               if (!cYAML_create_string(gw_nids, key,
+                       libcfs_nid2str(udsp_info->cud_pref_rtr_nid[idx])))
+                       return LUSTRE_CFG_RC_OUT_OF_MEM;
+       }
+
+       /* preferred source NIDs */
+       for (idx = 0; idx < LNET_MAX_SHOW_NUM_NID; idx++) {
+               memset(key, 0, sizeof(key));
+               if (udsp_info->cud_pref_nid[idx] == 0)
+                       break;
+
+               if (!src_nids) {
+                       src_nids = cYAML_create_object(udsp_nid,
+                                       "Preferred source NIDs");
+                       if (!src_nids)
+                               return LUSTRE_CFG_RC_OUT_OF_MEM;
+               }
+
+               snprintf(key, sizeof(key), "NID-%d", idx);
+               if (!cYAML_create_string(src_nids, key,
+                       libcfs_nid2str(udsp_info->cud_pref_nid[idx])))
+                       return LUSTRE_CFG_RC_OUT_OF_MEM;
+       }
+
+       return LUSTRE_CFG_RC_NO_ERR;
+}
+
 int lustre_lnet_show_net(char *nw, int detail, int seq_no,
                         struct cYAML **show_rc, struct cYAML **err_rc,
                         bool backup)
@@ -1903,6 +1997,7 @@ int lustre_lnet_show_net(char *nw, int detail, int seq_no,
        struct lnet_ioctl_element_stats *stats;
        struct lnet_ioctl_element_msg_stats msg_stats;
        struct lnet_ioctl_local_ni_hstats hstats;
+       struct lnet_ioctl_construct_udsp_info udsp_info;
        __u32 net = LNET_NET_ANY;
        __u32 prev_net = LNET_NET_ANY;
        int rc = LUSTRE_CFG_RC_OUT_OF_MEM, i, j;
@@ -2067,6 +2162,27 @@ int lustre_lnet_show_net(char *nw, int detail, int seq_no,
                                                        == NULL)
                                goto out;
 
+                       if (detail < 4)
+                               goto continue_without_udsp_info;
+
+                       LIBCFS_IOC_INIT_V2(udsp_info, cud_hdr);
+                       udsp_info.cud_nid = ni_data->lic_nid;
+                       udsp_info.cud_peer = false;
+                       rc = l_ioctl(LNET_DEV_ID,
+                                    IOC_LIBCFS_GET_CONST_UDSP_INFO,
+                                    &udsp_info);
+                       if (rc != 0) {
+                               l_errno = errno;
+                               goto continue_without_udsp_info;
+                       }
+
+                       rc = create_local_udsp_info(&udsp_info, item);
+                       if (rc) {
+                               l_errno = errno;
+                               goto out;
+                       }
+
+continue_without_udsp_info:
                        if (detail < 2)
                                goto continue_without_msg_stats;
 
@@ -2744,6 +2860,7 @@ int lustre_lnet_show_peer(char *knid, int detail, int seq_no,
        struct lnet_ioctl_element_stats *lpni_stats;
        struct lnet_ioctl_element_msg_stats *msg_stats;
        struct lnet_ioctl_peer_ni_hstats *hstats;
+       struct lnet_ioctl_construct_udsp_info udsp_info;
        lnet_nid_t *nidp;
        int rc = LUSTRE_CFG_RC_OUT_OF_MEM;
        int i, j, k;
@@ -2905,6 +3022,27 @@ int lustre_lnet_show_peer(char *knid, int detail, int seq_no,
                        if (backup)
                                continue;
 
+                       if (detail < 4)
+                               goto continue_without_udsp_info;
+
+                       LIBCFS_IOC_INIT_V2(udsp_info, cud_hdr);
+                       udsp_info.cud_nid = *nidp;
+                       udsp_info.cud_peer = true;
+                       rc = l_ioctl(LNET_DEV_ID,
+                                       IOC_LIBCFS_GET_CONST_UDSP_INFO,
+                                       &udsp_info);
+                       if (rc != 0) {
+                               l_errno = errno;
+                               goto continue_without_udsp_info;
+                       }
+
+                       rc = create_remote_udsp_info(&udsp_info, peer_ni);
+                       if (rc) {
+                               l_errno = errno;
+                               goto out;
+                       }
+
+continue_without_udsp_info:
                        if (cYAML_create_string(peer_ni, "state",
                                                lpni_cri->cr_aliveness)
                            == NULL)
@@ -4548,6 +4686,58 @@ static int handle_yaml_show_numa(struct cYAML *tree, struct cYAML **show_rc,
                                           show_rc, err_rc);
 }
 
+/* YAML "udsp" delete handler: reads the optional "idx" and "seq_no"
+ * items from @tree (each defaults to -1 when absent) and calls
+ * lustre_lnet_del_udsp(). @show_rc is not used.
+ */
+static int handle_yaml_del_udsp(struct cYAML *tree, struct cYAML **show_rc,
+                               struct cYAML **err_rc)
+{
+       struct cYAML *seq_no = cYAML_get_object_item(tree, "seq_no");
+       struct cYAML *idx = cYAML_get_object_item(tree, "idx");
+       int idx_val = -1;
+       int seq_val = -1;
+
+       if (idx)
+               idx_val = idx->cy_valueint;
+       if (seq_no)
+               seq_val = seq_no->cy_valueint;
+
+       return lustre_lnet_del_udsp(idx_val, seq_val, err_rc);
+}
+
+/* YAML "udsp" configure handler: collects the optional "src", "dst",
+ * "rte", "priority", "idx" and "seq_no" items from @tree and calls
+ * lustre_lnet_add_udsp().
+ *
+ * Missing strings are passed as NULL and missing numbers as -1. The
+ * action type is "priority" when a priority item is present and ""
+ * otherwise. @show_rc is not used.
+ */
+static int handle_yaml_config_udsp(struct cYAML *tree, struct cYAML **show_rc,
+                                  struct cYAML **err_rc)
+{
+       struct cYAML *seq_no = cYAML_get_object_item(tree, "seq_no");
+       struct cYAML *src = cYAML_get_object_item(tree, "src");
+       struct cYAML *rte = cYAML_get_object_item(tree, "rte");
+       struct cYAML *dst = cYAML_get_object_item(tree, "dst");
+       struct cYAML *prio = cYAML_get_object_item(tree, "priority");
+       struct cYAML *idx = cYAML_get_object_item(tree, "idx");
+       union lnet_udsp_action action;
+       char *src_str = NULL;
+       char *dst_str = NULL;
+       char *rte_str = NULL;
+       char *type_str = "";
+       int idx_val = -1;
+       int seq_val = -1;
+
+       action.udsp_priority = -1;
+       if (prio) {
+               action.udsp_priority = prio->cy_valueint;
+               type_str = "priority";
+       }
+
+       if (src)
+               src_str = src->cy_valuestring;
+       if (dst)
+               dst_str = dst->cy_valuestring;
+       if (rte)
+               rte_str = rte->cy_valuestring;
+       if (idx)
+               idx_val = idx->cy_valueint;
+       if (seq_no)
+               seq_val = seq_no->cy_valueint;
+
+       return lustre_lnet_add_udsp(src_str, dst_str, rte_str, type_str,
+                                   &action, idx_val, seq_val, err_rc);
+}
+
+/* YAML "udsp" show handler: reads the optional "idx" and "seq_no"
+ * items from @tree (each defaults to -1 when absent) and calls
+ * lustre_lnet_show_udsp().
+ */
+static int handle_yaml_show_udsp(struct cYAML *tree, struct cYAML **show_rc,
+                                struct cYAML **err_rc)
+{
+       struct cYAML *seq_no = cYAML_get_object_item(tree, "seq_no");
+       struct cYAML *idx = cYAML_get_object_item(tree, "idx");
+       int idx_val = -1;
+       int seq_val = -1;
+
+       if (idx)
+               idx_val = idx->cy_valueint;
+       if (seq_no)
+               seq_val = seq_no->cy_valueint;
+
+       return lustre_lnet_show_udsp(idx_val, seq_val, show_rc, err_rc);
+}
+
 static int handle_yaml_config_global_settings(struct cYAML *tree,
                                              struct cYAML **show_rc,
                                              struct cYAML **err_rc)
@@ -4810,6 +5000,7 @@ static struct lookup_cmd_hdlr_tbl lookup_config_tbl[] = {
        { .name = "numa",       .cb = handle_yaml_config_numa },
        { .name = "ping",       .cb = handle_yaml_no_op },
        { .name = "discover",   .cb = handle_yaml_no_op },
+       { .name = "udsp",       .cb = handle_yaml_config_udsp },
        { .name = NULL } };
 
 static struct lookup_cmd_hdlr_tbl lookup_del_tbl[] = {
@@ -4824,6 +5015,7 @@ static struct lookup_cmd_hdlr_tbl lookup_del_tbl[] = {
        { .name = "numa",       .cb = handle_yaml_del_numa },
        { .name = "ping",       .cb = handle_yaml_no_op },
        { .name = "discover",   .cb = handle_yaml_no_op },
+       { .name = "udsp",       .cb = handle_yaml_del_udsp },
        { .name = NULL } };
 
 static struct lookup_cmd_hdlr_tbl lookup_show_tbl[] = {
@@ -4838,6 +5030,7 @@ static struct lookup_cmd_hdlr_tbl lookup_show_tbl[] = {
        { .name = "numa",       .cb = handle_yaml_show_numa },
        { .name = "ping",       .cb = handle_yaml_no_op },
        { .name = "discover",   .cb = handle_yaml_no_op },
+       { .name = "udsp",       .cb = handle_yaml_show_udsp },
        { .name = NULL } };
 
 static struct lookup_cmd_hdlr_tbl lookup_exec_tbl[] = {
index fe9ce5d..3124c74 100644 (file)
 #define LUSTRE_CFG_RC_MATCH                    -7
 #define LUSTRE_CFG_RC_SKIP                     -8
 #define LUSTRE_CFG_RC_LAST_ELEM                        -9
+#define LUSTRE_CFG_RC_MARSHAL_FAIL             -10
+
+#define CONFIG_CMD             "configure"
+#define UNCONFIG_CMD           "unconfigure"
+#define ADD_CMD                        "add"
+#define DEL_CMD                        "del"
+#define SHOW_CMD               "show"
+#define DBG_CMD                        "dbg"
+#define MANAGE_CMD             "manage"
+
+#define MAX_NUM_IPS            128
+
+#define modparam_path "/sys/module/lnet/parameters/"
+#define o2ib_modparam_path "/sys/module/ko2iblnd/parameters/"
+#define gni_nid_path "/proc/cray_xt/"
 
 enum lnetctl_cmd {
        LNETCTL_CONFIG_CMD      = 1,
@@ -72,6 +87,57 @@ struct lnet_dlc_intf_descr {
        struct cfs_expr_list *cpt_expr;
 };
 
+/* These UDSP structures need to match the kernel space structures
+ * so that the same marshal and unmarshal logic works on both sides.
+ */
+
+/* Net is described as a
+ *  1. net type
+ *  2. num range
+ */
+struct lnet_ud_net_descr {
+       __u32 udn_net_type;                     /* net type; 0 is treated as "no criteria" when demarshaling */
+       struct list_head udn_net_num_range;     /* net number range expression(s) */
+};
+
+/* each NID range is defined as
+ *  1. net descriptor
+ *  2. address range descriptor
+ */
+struct lnet_ud_nid_descr {
+       struct lnet_ud_net_descr ud_net_id;     /* net descriptor */
+       struct list_head ud_addr_range;         /* address range descriptor: list of expressions */
+};
+
+/* a UDSP rule can have up to three user defined NID descriptors
+ *     - src: defines the local NID range for the rule
+ *     - dst: defines the peer NID range for the rule
+ *     - rte: defines the router NID range for the rule
+ *
+ * An action union defines the action to take when the rule
+ * is matched
+ */
+struct lnet_udsp {
+       struct list_head udsp_on_list;          /* list linkage for this rule */
+       __u32 udsp_idx;                         /* index of the rule */
+       struct lnet_ud_nid_descr udsp_src;      /* local NID range */
+       struct lnet_ud_nid_descr udsp_dst;      /* peer NID range */
+       struct lnet_ud_nid_descr udsp_rte;      /* router NID range */
+       enum lnet_udsp_action_type udsp_action_type; /* selects the member of udsp_action in use */
+       union {
+               __u32 udsp_priority;            /* priority action value */
+       } udsp_action;                          /* action taken when the rule matches */
+};
+
+/* This union is passed from lnetctl to fill the action union in udsp
+ * structure
+ * TODO: The idea here is if we add extra actions, ex: drop, it can be
+ * added to the union
+ */
+union lnet_udsp_action {
+       int udsp_priority;      /* priority to assign; the YAML handler passes -1 when none is given */
+};
+
 /* forward declaration of the cYAML structure. */
 struct cYAML;
 
@@ -704,4 +770,40 @@ int lustre_lnet_parse_interfaces(char *intf_str,
 int lustre_lnet_parse_nidstr(char *nidstr, lnet_nid_t *lnet_nidlist,
                             int max_nids, char *err_str);
 
+/* lustre_lnet_add_udsp
+ *     Add a selection policy.
+ *     src - source NID descriptor
+ *     dst - destination NID descriptor
+ *     rte - router NID descriptor
+ *     type - action type
+ *     action - union of the action
+ *     idx - the index at which to insert the policy
+ *     seq_no - sequence number of the request
+ *     err_rc - [OUT] struct cYAML tree describing the error. Freed by
+ *               caller
+ */
+int lustre_lnet_add_udsp(char *src, char *dst, char *rte, char *type,
+                        union lnet_udsp_action *action, int idx,
+                        int seq_no, struct cYAML **err_rc);
+
+/* lustre_lnet_del_udsp
+ *     Delete a net selection policy.
+ *     idx - the index to delete
+ *     seq_no - sequence number of the request
+ *     err_rc - [OUT] struct cYAML tree describing the error. Freed by
+ *     caller
+ */
+int lustre_lnet_del_udsp(unsigned int idx, int seq_no, struct cYAML **err_rc);
+
+/* lustre_lnet_show_udsp
+ *     show selection policy.
+ *     idx - the index to show. -1 to show all policies
+ *     seq_no - sequence number of the request
+ *     show_rc - [IN/OUT] struct cYAML tree containing udsp info
+ *     err_rc - [OUT] struct cYAML tree describing the error. Freed by
+ *     caller
+ */
+int lustre_lnet_show_udsp(int idx, int seq_no, struct cYAML **show_rc,
+                         struct cYAML **err_rc);
+
 #endif /* LIB_LNET_CONFIG_API_H */
diff --git a/lnet/utils/lnetconfig/liblnetconfig_udsp.c b/lnet/utils/lnetconfig/liblnetconfig_udsp.c
new file mode 100644 (file)
index 0000000..045c056
--- /dev/null
@@ -0,0 +1,859 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2011, 2017, Intel Corporation.
+ *
+ * Copyright (c) 2018-2020 Data Direct Networks.
+ *
+ *   This file is part of Lustre, https://wiki.whamcloud.com/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   version 2 along with this program; If not, see
+ *   http://www.gnu.org/licenses/gpl-2.0.html
+ *
+ * Author: Sonia Sharma
+ */
+/*
+ * Copyright (c) 2020, Whamcloud.
+ *
+ */
+
+#include <errno.h>
+#include <limits.h>
+#include <byteswap.h>
+#include <netdb.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <libcfs/util/ioctl.h>
+#include <linux/lnet/lnetctl.h>
+#include "liblnd.h"
+#include <sys/types.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include <linux/lnet/lnet-dlc.h>
+#include "liblnetconfig.h"
+
+/* A NID descriptor carries matching criteria only when its net type is
+ * non-zero; a zero net type marks the descriptor as unused.
+ */
+static inline bool
+lnet_udsp_criteria_present(struct lnet_ud_nid_descr *descr)
+{
+       __u32 net_type = descr->ud_net_id.udn_net_type;
+
+       return net_type ? true : false;
+}
+
+/* Allocate a zero-filled UDSP and initialise every embedded list head
+ * to the empty list. Returns NULL when the allocation fails.
+ */
+struct lnet_udsp *lnet_udsp_alloc(void)
+{
+       struct lnet_udsp *policy = calloc(1, sizeof(*policy));
+       struct lnet_ud_nid_descr *descrs[3];
+       int n;
+
+       if (policy == NULL)
+               return NULL;
+
+       INIT_LIST_HEAD(&policy->udsp_on_list);
+
+       /* the three NID descriptors are set up identically */
+       descrs[0] = &policy->udsp_src;
+       descrs[1] = &policy->udsp_dst;
+       descrs[2] = &policy->udsp_rte;
+       for (n = 0; n < 3; n++) {
+               INIT_LIST_HEAD(&descrs[n]->ud_addr_range);
+               INIT_LIST_HEAD(&descrs[n]->ud_net_id.udn_net_num_range);
+       }
+
+       return policy;
+}
+
+/* Release the expression lists hanging off one NID descriptor.
+ *
+ * blk == false: the lists were built element by element, so both are
+ * handed to cfs_expr_list_free_list().
+ * blk == true: the descriptor was filled from one large buffer holding
+ * all of its data, so that buffer is freed in one go through whichever
+ * list points at its start.
+ */
+static void
+lnet_udsp_nid_descr_free(struct lnet_ud_nid_descr *nid_descr, bool blk)
+{
+       struct list_head *nets = &nid_descr->ud_net_id.udn_net_num_range;
+       struct list_head *addrs = &nid_descr->ud_addr_range;
+
+       /* an unused descriptor owns nothing */
+       if (!lnet_udsp_criteria_present(nid_descr))
+               return;
+
+       if (!blk) {
+               cfs_expr_list_free_list(nets);
+               cfs_expr_list_free_list(addrs);
+               return;
+       }
+
+       /* When the net range is present its first entry sits at the
+        * start of the buffer; otherwise the first address expression
+        * does.
+        */
+       if (!list_empty(nets))
+               free(nets->next);
+       else if (!list_empty(addrs))
+               free(addrs->next);
+}
+
+/* Free a UDSP together with its src, dst and rte descriptors. @blk is
+ * passed through to lnet_udsp_nid_descr_free() and selects how the
+ * descriptor memory is released.
+ */
+void
+lnet_udsp_free(struct lnet_udsp *udsp, bool blk)
+{
+       struct lnet_ud_nid_descr *descrs[] = {
+               &udsp->udsp_src,
+               &udsp->udsp_dst,
+               &udsp->udsp_rte,
+       };
+       unsigned int n;
+
+       for (n = 0; n < sizeof(descrs) / sizeof(descrs[0]); n++)
+               lnet_udsp_nid_descr_free(descrs[n], blk);
+
+       free(udsp);
+}
+
+/* Unpack one expression (a run of lnet_range_expr records) from the
+ * marshalled stream at *bulk into a cfs_expr_list carved out of the
+ * pre-allocated buffer at *buf, and append it to @list.
+ *
+ * count ==  0: nothing to copy, neither cursor moves.
+ * count == -1: the stream holds a struct lnet_expressions header first;
+ *              its le_count gives the number of ranges.
+ * otherwise:   @count range records follow directly.
+ *
+ * Both *bulk and *buf are advanced past what was consumed. No bounds
+ * are checked here; the caller must have validated the stream and
+ * sized the buffer beforehand.
+ */
+static void
+copy_range_info(void __user **bulk, void **buf, struct list_head *list,
+               int count)
+{
+       struct lnet_range_expr *range_expr;
+       struct cfs_range_expr *range;
+       struct cfs_expr_list *exprs;
+       int range_count = count;
+       int i;
+
+       if (range_count == 0)
+               return;
+
+       if (range_count == -1) {
+               struct lnet_expressions *e;
+
+               e = *bulk;
+               range_count = e->le_count;
+               *bulk += sizeof(*e); /* skip the per-expression header */
+       }
+
+       exprs = *buf; /* the list node lives inside the caller's buffer */
+       INIT_LIST_HEAD(&exprs->el_link);
+       INIT_LIST_HEAD(&exprs->el_exprs);
+       list_add_tail(&exprs->el_link, list);
+       *buf += sizeof(*exprs);
+
+       for (i = 0; i < range_count; i++) {
+               range_expr = *bulk;
+               range = *buf;
+               INIT_LIST_HEAD(&range->re_link);
+               range->re_lo = range_expr->re_lo;
+               range->re_hi = range_expr->re_hi;
+               range->re_stride = range_expr->re_stride;
+               list_add_tail(&range->re_link, &exprs->el_exprs);
+               *bulk += sizeof(*range_expr);
+               *buf += sizeof(*range);
+       }
+}
+
+/* Unpack one marshalled NID descriptor from *bulk into @nid_descr.
+ *
+ * @type is the expected three-letter tag ("SRC", "DST" or "RTE"); its
+ * first four bytes are compared, as a __u32, with the descriptor type
+ * stored in the stream.
+ * On success *bulk is advanced past the descriptor and *bulk_size is
+ * reduced by the number of bytes consumed.
+ *
+ * Returns 0 on success, -EINVAL if the tag does not match or the
+ * stream is shorter than the descriptor claims, -ENOMEM if the
+ * destination buffer cannot be allocated.
+ *
+ * NOTE(review): one buffer is allocated for the whole descriptor and
+ * is reachable afterwards only through the list entries placed in it.
+ * If both the net range count and the expression count are zero,
+ * nothing is linked and the buffer looks unreachable - confirm.
+ */
+static int
+copy_ioc_udsp_descr(struct lnet_ud_nid_descr *nid_descr, char *type,
+                   void **bulk, __u32 *bulk_size)
+{
+       struct lnet_ioctl_udsp_descr *ioc_nid = *bulk;
+       struct lnet_expressions *exprs;
+       __u32 descr_type;
+       int expr_count = 0;
+       int range_count = 0;
+       int i;
+       __u32 size;
+       int remaining_size = *bulk_size;
+       void *tmp = *bulk; /* scratch cursor for the sizing pass */
+       __u32 alloc_size;
+       void *buf;
+       size_t range_expr_s = sizeof(struct lnet_range_expr);
+       size_t lnet_exprs_s = sizeof(struct lnet_expressions);
+
+       /* criteria not present, skip over the static part of the
+        * bulk, which is included for each NID descriptor
+        */
+       if (ioc_nid->iud_net.ud_net_type == 0) {
+               remaining_size -= sizeof(*ioc_nid);
+               if (remaining_size < 0)
+                       return -EINVAL;
+               *bulk += sizeof(*ioc_nid);
+               *bulk_size = remaining_size;
+               return 0;
+       }
+
+       descr_type = ioc_nid->iud_src_hdr.ud_descr_type;
+       if (descr_type != *(__u32 *)type)
+               return -EINVAL;
+
+       /* calculate the total size to verify we have enough buffer.
+        * Start of by finding how many ranges there are for the net
+        * expression.
+        */
+       range_count = ioc_nid->iud_net.ud_net_num_expr.le_count;
+       size = sizeof(*ioc_nid) + (range_count * range_expr_s);
+       remaining_size -= size;
+       if (remaining_size < 0)
+               return -EINVAL;
+
+       /* the number of expressions for the NID. IE 4 for IP, 1 for GNI */
+       expr_count = ioc_nid->iud_src_hdr.ud_descr_count;
+       /* point tmp to the beginning of the NID expressions */
+       tmp += size;
+       for (i = 0; i < expr_count; i++) {
+               /* get the number of ranges per expression */
+               exprs = tmp;
+               range_count += exprs->le_count;
+               size = (range_expr_s * exprs->le_count) + lnet_exprs_s;
+               remaining_size -= size;
+               if (remaining_size < 0)
+                       return -EINVAL;
+               tmp += size;
+       }
+
+       *bulk_size = remaining_size;
+
+       /* copy over the net type */
+       nid_descr->ud_net_id.udn_net_type = ioc_nid->iud_net.ud_net_type;
+
+       /* allocate the total memory required to copy this NID descriptor */
+       alloc_size = (sizeof(struct cfs_expr_list) * (expr_count + 1)) +
+                    (sizeof(struct cfs_range_expr) * (range_count));
+       buf = calloc(alloc_size, 1);
+       if (!buf)
+               return -ENOMEM;
+
+       /* copy over the net number range */
+       range_count = ioc_nid->iud_net.ud_net_num_expr.le_count;
+       *bulk += sizeof(*ioc_nid);
+       copy_range_info(bulk, &buf, &nid_descr->ud_net_id.udn_net_num_range,
+                       range_count);
+
+       /* copy over the NID descriptor */
+       for (i = 0; i < expr_count; i++)
+               copy_range_info(bulk, &buf, &nid_descr->ud_addr_range, -1);
+
+       return 0;
+}
+
+/* Rebuild a struct lnet_udsp from a marshalled buffer.
+ *
+ * @bulk points at a struct lnet_ioctl_udsp and @bulk_size is the size
+ * of that header plus the payload it describes. The payload is read
+ * through the header's iou_bulk pointer and must be exactly
+ * iou_bulk_size bytes long.
+ *
+ * Returns the new policy, to be released with lnet_udsp_free(udsp, true),
+ * or NULL on any size mismatch, malformed descriptor or allocation
+ * failure.
+ */
+struct lnet_udsp *
+lnet_udsp_demarshal(void *bulk, __u32 bulk_size)
+{
+       struct lnet_ioctl_udsp *hdr = bulk;
+       struct lnet_udsp *policy;
+
+       if (bulk_size < sizeof(*hdr))
+               return NULL;
+
+       policy = lnet_udsp_alloc();
+       if (policy == NULL)
+               return NULL;
+
+       policy->udsp_action_type = hdr->iou_action_type;
+       policy->udsp_action.udsp_priority = hdr->iou_action.priority;
+       policy->udsp_idx = hdr->iou_idx;
+
+       /* from here on only the payload is walked */
+       bulk = hdr->iou_bulk;
+       bulk_size -= sizeof(*hdr);
+
+       /* descriptors are stored in src, dst, rte order; stop at the
+        * first one that fails
+        */
+       if (bulk_size == hdr->iou_bulk_size &&
+           copy_ioc_udsp_descr(&policy->udsp_src, "SRC",
+                               &bulk, &bulk_size) >= 0 &&
+           copy_ioc_udsp_descr(&policy->udsp_dst, "DST",
+                               &bulk, &bulk_size) >= 0 &&
+           copy_ioc_udsp_descr(&policy->udsp_rte, "RTE",
+                               &bulk, &bulk_size) >= 0)
+               return policy;
+
+       lnet_udsp_free(policy, true);
+       return NULL;
+}
+
+/* Count the entries on @list by walking it from head back to head. */
+static inline int
+lnet_get_list_len(struct list_head *list)
+{
+       struct list_head *pos;
+       int n = 0;
+
+       for (pos = list->next; pos != list; pos = pos->next)
+               n++;
+
+       return n;
+}
+
+/* Number of bytes one NID descriptor occupies once marshalled: the
+ * fixed lnet_ioctl_udsp_descr header, one lnet_expressions header per
+ * address expression and one lnet_range_expr per range (net number
+ * ranges included). An unused descriptor still takes the fixed header.
+ */
+static size_t
+lnet_size_marshaled_nid_descr(struct lnet_ud_nid_descr *descr)
+{
+       struct list_head *net_range = &descr->ud_net_id.udn_net_num_range;
+       struct cfs_expr_list *el;
+       int nr_exprs = 0;
+       int nr_ranges = 0;
+
+       if (!lnet_udsp_criteria_present(descr))
+               return sizeof(struct lnet_ioctl_udsp_descr);
+
+       /* only the first net number expression is taken into account */
+       if (!list_empty(net_range)) {
+               el = list_entry(net_range->next, struct cfs_expr_list,
+                               el_link);
+               nr_ranges = lnet_get_list_len(&el->el_exprs);
+       }
+
+       /* add the ranges of every address expression */
+       list_for_each_entry(el, &descr->ud_addr_range, el_link) {
+               nr_exprs++;
+               nr_ranges += lnet_get_list_len(&el->el_exprs);
+       }
+
+       return sizeof(struct lnet_ioctl_udsp_descr) +
+              (sizeof(struct lnet_expressions) * nr_exprs) +
+              (sizeof(struct lnet_range_expr) * nr_ranges);
+}
+
+/* Total marshalled size of @udsp: the lnet_ioctl_udsp header followed
+ * by the src, dst and rte descriptors.
+ */
+size_t
+lnet_get_udsp_size(struct lnet_udsp *udsp)
+{
+       return sizeof(struct lnet_ioctl_udsp) +
+              lnet_size_marshaled_nid_descr(&udsp->udsp_src) +
+              lnet_size_marshaled_nid_descr(&udsp->udsp_dst) +
+              lnet_size_marshaled_nid_descr(&udsp->udsp_rte);
+}
+
+/* Append every range of @expr to the stream at *bulk as a
+ * struct lnet_range_expr, advancing *bulk and shrinking *bulk_size by
+ * the bytes written. The caller guarantees there is enough room.
+ */
+static void
+copy_exprs(struct cfs_expr_list *expr, void __user **bulk,
+          __s32 *bulk_size)
+{
+       struct cfs_range_expr *re;
+       struct lnet_range_expr wire;
+
+       list_for_each_entry(re, &expr->el_exprs, re_link) {
+               /* translate to the wire representation */
+               wire.re_lo = re->re_lo;
+               wire.re_hi = re->re_hi;
+               wire.re_stride = re->re_stride;
+
+               memcpy(*bulk, &wire, sizeof(wire));
+               *bulk += sizeof(wire);
+               *bulk_size -= sizeof(wire);
+       }
+}
+
+/* Marshal one NID descriptor into the stream at *bulk.
+ *
+ * A struct lnet_ioctl_udsp_descr header tagged with the first four
+ * bytes of @type ("SRC", "DST" or "RTE") is always written. When the
+ * descriptor has criteria it is followed by the net number ranges and
+ * then, for every address expression, a struct lnet_expressions
+ * header and its ranges.
+ *
+ * *bulk is advanced and *bulk_size reduced by the bytes written. The
+ * remaining space is not checked here; lnet_udsp_marshal() verifies
+ * the total size up front. Always returns 0.
+ */
+static int
+copy_nid_range(struct lnet_ud_nid_descr *nid_descr, char *type,
+               void __user **bulk, __s32 *bulk_size)
+{
+       struct lnet_ioctl_udsp_descr ioc_udsp_descr = { { 0 } };
+       struct cfs_expr_list *expr;
+       struct lnet_expressions ioc_expr;
+       int expr_count;
+       int net_expr_count = 0;
+
+       ioc_udsp_descr.iud_src_hdr.ud_descr_type = *(__u32 *)type;
+
+       /* if criteria not present, copy over the static part of the NID
+        * descriptor
+        */
+       if (!lnet_udsp_criteria_present(nid_descr)) {
+               memcpy(*bulk, &ioc_udsp_descr,
+                       sizeof(ioc_udsp_descr));
+               *bulk += sizeof(ioc_udsp_descr);
+               *bulk_size -= sizeof(ioc_udsp_descr);
+               return 0;
+       }
+
+       expr_count = lnet_get_list_len(&nid_descr->ud_addr_range);
+
+       /* copy the net information */
+       if (!list_empty(&nid_descr->ud_net_id.udn_net_num_range)) {
+               /* only the first net number expression is marshalled */
+               expr = list_entry(nid_descr->ud_net_id.udn_net_num_range.next,
+                                 struct cfs_expr_list, el_link);
+               net_expr_count = lnet_get_list_len(&expr->el_exprs);
+       } else {
+               net_expr_count = 0;
+       }
+
+       /* set the total expression count */
+       ioc_udsp_descr.iud_src_hdr.ud_descr_count = expr_count;
+       ioc_udsp_descr.iud_net.ud_net_type =
+               nid_descr->ud_net_id.udn_net_type;
+       ioc_udsp_descr.iud_net.ud_net_num_expr.le_count = net_expr_count;
+
+       /* copy over the header info to the bulk */
+       memcpy(*bulk, &ioc_udsp_descr, sizeof(ioc_udsp_descr));
+       *bulk += sizeof(ioc_udsp_descr);
+       *bulk_size -= sizeof(ioc_udsp_descr);
+
+       /* copy over the net num expression if it exists */
+       if (net_expr_count)
+               copy_exprs(expr, bulk, bulk_size); /* expr was set in the net branch above */
+
+       /* copy the address range */
+       list_for_each_entry(expr, &nid_descr->ud_addr_range, el_link) {
+               ioc_expr.le_count = lnet_get_list_len(&expr->el_exprs);
+               memcpy(*bulk, &ioc_expr, sizeof(ioc_expr));
+               *bulk += sizeof(ioc_expr);
+               *bulk_size -= sizeof(ioc_expr);
+
+               copy_exprs(expr, bulk, bulk_size);
+       }
+
+       return 0;
+}
+
+/* Serialise @udsp into the caller supplied buffer @bulk of @bulk_size
+ * bytes: the index and action go into the lnet_ioctl_udsp header at
+ * the start of the buffer, the src, dst and rte descriptors follow it.
+ *
+ * Returns 0 on success, -EINVAL when the buffer is smaller than
+ * lnet_get_udsp_size() requires, or the first non-zero result of
+ * copy_nid_range().
+ */
+static int
+lnet_udsp_marshal(struct lnet_udsp *udsp, void *bulk,
+                 __s32 bulk_size)
+{
+       struct lnet_ioctl_udsp *hdr = bulk;
+       int rc;
+
+       /* refuse to write into a buffer that cannot hold the policy */
+       if (bulk_size < lnet_get_udsp_size(udsp))
+               return -EINVAL;
+
+       hdr->iou_idx = udsp->udsp_idx;
+       hdr->iou_action_type = udsp->udsp_action_type;
+       hdr->iou_action.priority = udsp->udsp_action.udsp_priority;
+
+       /* the descriptors start right after the header */
+       bulk += sizeof(*hdr);
+       bulk_size -= sizeof(*hdr);
+
+       rc = copy_nid_range(&udsp->udsp_src, "SRC", &bulk, &bulk_size);
+       if (rc == 0)
+               rc = copy_nid_range(&udsp->udsp_dst, "DST", &bulk,
+                                   &bulk_size);
+       if (rc == 0)
+               rc = copy_nid_range(&udsp->udsp_rte, "RTE", &bulk,
+                                   &bulk_size);
+
+       return rc;
+}
+
+/* Map an action name to its enum value. Only the leading characters
+ * are compared, so any string starting with "priority" or "pref" is
+ * accepted. NULL and unknown names give EN_LNET_UDSP_ACTION_NONE.
+ */
+static enum lnet_udsp_action_type
+lnet_str2udsp_action(char *type)
+{
+       static const char prio_name[] = "priority";
+       static const char pref_name[] = "pref";
+
+       if (type == NULL)
+               return EN_LNET_UDSP_ACTION_NONE;
+
+       if (strncmp(type, prio_name, sizeof(prio_name) - 1) == 0)
+               return EN_LNET_UDSP_ACTION_PRIORITY;
+
+       if (strncmp(type, pref_name, sizeof(pref_name) - 1) == 0)
+               return EN_LNET_UDSP_ACTION_PREFERRED_LIST;
+
+       return EN_LNET_UDSP_ACTION_NONE;
+}
+
+/* lustre_lnet_add_udsp
+ *     Build a selection policy from the given NID range strings,
+ *     marshal it and pass it to the kernel with IOC_LIBCFS_ADD_UDSP.
+ *
+ *     src, dst, rte - NID range strings, NULL when not specified.
+ *             At least one is required and src cannot be combined
+ *             with rte.
+ *     type - action name, see lnet_str2udsp_action()
+ *     action - action value; only udsp_priority is read, and only for
+ *             the priority action
+ *     idx - index at which to insert the policy
+ *     seq_no - sequence number of the request
+ *     err_rc - [OUT] cYAML tree describing the result
+ *
+ *     Returns LUSTRE_CFG_RC_NO_ERR on success, a LUSTRE_CFG_RC_* code,
+ *     the negative result of cfs_parse_nid_parts(), or -errno when
+ *     the ioctl fails.
+ */
+int lustre_lnet_add_udsp(char *src, char *dst, char *rte,
+                        char *type, union lnet_udsp_action *action,
+                        int idx, int seq_no, struct cYAML **err_rc)
+{
+       struct lnet_udsp *udsp = NULL;
+       struct lnet_ioctl_udsp *udsp_bulk;
+       int rc = LUSTRE_CFG_RC_OUT_OF_MEM;
+       void *bulk = NULL;
+       __u32 bulk_size;
+       char err_str[LNET_MAX_STR_LEN];
+       enum lnet_udsp_action_type action_type;
+       struct {
+               char *nidstr;
+               const char *name;
+               struct lnet_ud_nid_descr *descr;
+       } parts[3];
+       int i;
+
+       snprintf(err_str, sizeof(err_str), "\"success\"");
+
+       action_type = lnet_str2udsp_action(type);
+       if (action_type == EN_LNET_UDSP_ACTION_NONE) {
+               /* type may be NULL here, never hand that to %s */
+               snprintf(err_str, sizeof(err_str),
+                        "\"bad action type specified: %s\"",
+                        type ? type : "(null)");
+               rc = LUSTRE_CFG_RC_BAD_PARAM;
+               goto out;
+       }
+
+       /* sanitize parameters:
+        * src-dst can be simultaneously present
+        * dst-rte can be simultaneously present
+        * src-rte is rejected, with or without dst
+        */
+       if ((!src && !rte && !dst) || (src && rte)) {
+               snprintf(err_str, sizeof(err_str),
+                 "\"The combination of src, dst and rte is not supported\"");
+               rc = LUSTRE_CFG_RC_BAD_PARAM;
+               goto out;
+       }
+
+       udsp = lnet_udsp_alloc();
+       if (!udsp) {
+               rc = LUSTRE_CFG_RC_OUT_OF_MEM;
+               snprintf(err_str, sizeof(err_str), "\"out of memory\"");
+               goto out;
+       }
+
+       udsp->udsp_idx = idx;
+       udsp->udsp_action_type = action_type;
+
+       /* a priority of -1 will result in the lowest possible priority */
+       if (action_type == EN_LNET_UDSP_ACTION_PRIORITY && action)
+               udsp->udsp_action.udsp_priority = action->udsp_priority;
+
+       /* override with the default
+        * if priority is expected, but not specified
+        */
+       if (!rte && ((dst && !src) || (src && !dst)) &&
+           action_type != EN_LNET_UDSP_ACTION_PRIORITY) {
+               udsp->udsp_action_type = EN_LNET_UDSP_ACTION_PRIORITY;
+               udsp->udsp_action.udsp_priority = 0;
+       }
+
+       /* parse whichever of the three NID ranges were supplied */
+       parts[0].nidstr = src;
+       parts[0].name = "src";
+       parts[0].descr = &udsp->udsp_src;
+       parts[1].nidstr = dst;
+       parts[1].name = "dst";
+       parts[1].descr = &udsp->udsp_dst;
+       parts[2].nidstr = rte;
+       parts[2].name = "rte";
+       parts[2].descr = &udsp->udsp_rte;
+
+       for (i = 0; i < 3; i++) {
+               struct lnet_ud_nid_descr *d = parts[i].descr;
+
+               if (!parts[i].nidstr)
+                       continue;
+
+               rc = cfs_parse_nid_parts(parts[i].nidstr, &d->ud_addr_range,
+                                        &d->ud_net_id.udn_net_num_range,
+                                        &d->ud_net_id.udn_net_type);
+               if (rc < 0) {
+                       snprintf(err_str, sizeof(err_str),
+                                "\"failed to parse %s parameter\"",
+                                parts[i].name);
+                       goto out;
+               }
+       }
+
+       bulk_size = lnet_get_udsp_size(udsp);
+       bulk = calloc(1, bulk_size);
+       if (!bulk) {
+               rc = LUSTRE_CFG_RC_OUT_OF_MEM;
+               snprintf(err_str, sizeof(err_str), "\"out of memory\"");
+               goto out;
+       }
+
+       udsp_bulk = bulk;
+       LIBCFS_IOC_INIT_V2(*udsp_bulk, iou_hdr);
+       udsp_bulk->iou_hdr.ioc_len = bulk_size;
+       udsp_bulk->iou_bulk_size = bulk_size - sizeof(*udsp_bulk);
+
+       rc = lnet_udsp_marshal(udsp, bulk, bulk_size);
+       if (rc != LUSTRE_CFG_RC_NO_ERR) {
+               rc = LUSTRE_CFG_RC_MARSHAL_FAIL;
+               snprintf(err_str,
+                        sizeof(err_str),
+                        "\"failed to marshal udsp\"");
+               goto out;
+       }
+
+       /* the payload follows the header in the same allocation */
+       udsp_bulk->iou_bulk = (char *)bulk + sizeof(*udsp_bulk);
+
+       rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_ADD_UDSP, bulk);
+       if (rc < 0) {
+               /* negative errno, like the other calls in this file */
+               rc = -errno;
+               snprintf(err_str, sizeof(err_str),
+                        "\"cannot add udsp: %s\"", strerror(-rc));
+               goto out;
+       }
+
+       rc = LUSTRE_CFG_RC_NO_ERR;
+
+out:
+       free(bulk);
+       /* the lists were built by cfs_parse_nid_parts(), so they are
+        * freed element by element rather than as one block
+        */
+       if (udsp)
+               lnet_udsp_free(udsp, false);
+       cYAML_build_error(rc, seq_no, ADD_CMD, "udsp", err_str, err_rc);
+       return rc;
+}
+
+/* lustre_lnet_del_udsp
+ *     Ask the kernel, through IOC_LIBCFS_DEL_UDSP, to delete the
+ *     selection policy at @idx.
+ *     idx - the index to delete
+ *     seq_no - sequence number of the request
+ *     err_rc - [OUT] cYAML tree describing the result
+ *
+ *     Returns the ioctl result on success, -errno on failure.
+ */
+int lustre_lnet_del_udsp(unsigned int idx, int seq_no, struct cYAML **err_rc)
+{
+       int rc;
+       char err_str[LNET_MAX_STR_LEN];
+       struct lnet_ioctl_udsp udsp_bulk;
+
+       snprintf(err_str, sizeof(err_str), "\"success\"");
+
+       LIBCFS_IOC_INIT_V2(udsp_bulk, iou_hdr);
+       udsp_bulk.iou_idx = idx;
+
+       rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_DEL_UDSP, &udsp_bulk);
+       if (rc < 0) {
+               rc = -errno;
+               /* strerror() wants the positive errno value */
+               snprintf(err_str, sizeof(err_str),
+                        "\"cannot del udsp: %s\"", strerror(-rc));
+       }
+
+       /* report the outcome under the "del" command */
+       cYAML_build_error(rc, seq_no, DEL_CMD, "udsp", err_str, err_rc);
+       return rc;
+}
+
+/* Append a textual form of NID descriptor @d to @str, a buffer of
+ * @size bytes that must already hold a NUL terminated string, since
+ * strncat() is used (lustre_lnet_show_udsp() zeroes it first).
+ *
+ * An unused descriptor (net type 0) is rendered as "NA". Otherwise
+ * the address expressions, an '@' if any address was written, the net
+ * name and the net number expressions are appended in that order.
+ *
+ * Returns 0 on success, -ENOBUFS when the net name does not fit, or a
+ * negative value passed on from cfs_expr2str().
+ *
+ * NOTE(review): the accounting below assumes cfs_expr2str() returns
+ * the space still left in the buffer - confirm against its
+ * implementation.
+ */
+int lustre_lnet_nid_descr2str(struct lnet_ud_nid_descr *d,
+                                    char *str, size_t size)
+{
+       int left = size;
+       int len;
+       char *net;
+       bool addr_found = false;
+
+       /* criteria not defined */
+       if (d->ud_net_id.udn_net_type == 0) {
+               strncat(str, "NA", left - 1);
+               return 0;
+       }
+
+       left = cfs_expr2str(&d->ud_addr_range, str, left);
+       if (left < 0)
+               return left;
+       net = libcfs_net2str(LNET_MKNET(d->ud_net_id.udn_net_type, 0));
+       if (left < size) { /* some space was used, so an address was written */
+               len = strlen(net) + 2; /* account for @ and NULL termination */
+               addr_found = true;
+       } else {
+               len = strlen(net) + 1; /* account for NULL termination */
+       }
+
+       if (left - len < 0)
+               return -ENOBUFS;
+
+       if (addr_found) {
+               strncat(str, "@", left);
+               left -= 1;
+       }
+
+       strncat(str, net, left);
+
+       left -= strlen(net) + 1;
+
+       left = cfs_expr2str(&d->ud_net_id.udn_net_num_range, str, left);
+       if (left < 0)
+               return left;
+
+       return 0;
+}
+
+/* Add an "action" object describing the policy's action under @y.
+ * Only the priority action is rendered, as a "priority" number; any
+ * other action type adds nothing. Returns 0 on success and -ENOMEM
+ * when a YAML node cannot be created.
+ */
+int yaml_add_udsp_action(struct cYAML *y, struct lnet_udsp *udsp)
+{
+       struct cYAML *action;
+
+       if (udsp->udsp_action_type != EN_LNET_UDSP_ACTION_PRIORITY)
+               return 0;
+
+       action = cYAML_create_object(y, "action");
+       if (action == NULL)
+               return -ENOMEM;
+
+       if (cYAML_create_number(action, "priority",
+                               udsp->udsp_action.udsp_priority) == NULL)
+               return -ENOMEM;
+
+       return 0;
+}
+
+/* Render NID descriptor @descr as a string node called @name under
+ * @item. Returns 0 on success, the error from
+ * lustre_lnet_nid_descr2str(), or LUSTRE_CFG_RC_OUT_OF_MEM when the
+ * YAML node cannot be created.
+ */
+static int
+yaml_add_udsp_descr(struct cYAML *item, char *name,
+                   struct lnet_ud_nid_descr *descr)
+{
+       char tmp[LNET_MAX_STR_LEN];
+       int rc;
+
+       /* the converter appends, so start from an empty string */
+       memset(tmp, 0, sizeof(tmp));
+       rc = lustre_lnet_nid_descr2str(descr, tmp, sizeof(tmp));
+       if (rc)
+               return rc;
+
+       if (cYAML_create_string(item, name, tmp) == NULL)
+               return LUSTRE_CFG_RC_OUT_OF_MEM;
+
+       return 0;
+}
+
+/* lustre_lnet_show_udsp
+ *     Fetch selection policies from the kernel and build a YAML
+ *     "udsp" sequence describing them.
+ *     idx - the index to show. -1 to show all policies
+ *     seq_no - sequence number of the request
+ *     show_rc - [IN/OUT] cYAML tree receiving the udsp info. When NULL
+ *               the tree is printed instead.
+ *     err_rc - [OUT] cYAML tree describing the result
+ *
+ *     Returns LUSTRE_CFG_RC_NO_ERR on success, including when the
+ *     kernel reports ENOENT (end of the list, or idx not found);
+ *     otherwise a negative errno or LUSTRE_CFG_RC_* code.
+ */
+int lustre_lnet_show_udsp(int idx, int seq_no, struct cYAML **show_rc,
+                         struct cYAML **err_rc)
+{
+       struct lnet_ioctl_udsp *data = NULL;
+       char *ioctl_buf = NULL;
+       struct lnet_ioctl_udsp get_size;
+       int rc = LUSTRE_CFG_RC_OUT_OF_MEM, i;
+       int l_errno = 0;
+       int use_idx = 0;
+       struct cYAML *root = NULL, *udsp_node = NULL,
+                    *first_seq = NULL;
+       struct cYAML *item = NULL;
+       char err_str[LNET_MAX_STR_LEN];
+       struct lnet_udsp *udsp = NULL;
+       bool exist = false;
+
+       snprintf(err_str, sizeof(err_str), "\"out of memory\"");
+
+       root = cYAML_create_object(NULL, NULL);
+       if (!root)
+               goto out;
+
+       udsp_node = cYAML_create_seq(root, "udsp");
+       if (!udsp_node)
+               goto out;
+
+       for (i = 0;; i++) {
+               LIBCFS_IOC_INIT_V2(get_size, iou_hdr);
+               use_idx = (idx != -1) ? idx : i;
+               get_size.iou_idx = use_idx;
+
+               /* the size of the marshalled policy comes back in iou_idx */
+               rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_UDSP_SIZE, &get_size);
+               if (rc != 0) {
+                       l_errno = errno;
+                       break;
+               }
+
+               /* the header is written into the buffer below, so the
+                * reported size must at least cover it
+                */
+               if (get_size.iou_idx < sizeof(*data)) {
+                       l_errno = EINVAL;
+                       break;
+               }
+
+               ioctl_buf = calloc(get_size.iou_idx, 1);
+               if (!ioctl_buf) {
+                       l_errno = errno;
+                       break;
+               }
+
+               data = (struct lnet_ioctl_udsp *)ioctl_buf;
+
+               LIBCFS_IOC_INIT_V2(*data, iou_hdr);
+               data->iou_bulk_size = get_size.iou_idx - sizeof(*data);
+               data->iou_bulk = ioctl_buf + sizeof(*data);
+               data->iou_idx = use_idx;
+
+               rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_UDSP, ioctl_buf);
+               if (rc != 0) {
+                       l_errno = errno;
+                       break;
+               }
+
+               udsp = lnet_udsp_demarshal(ioctl_buf,
+                       data->iou_hdr.ioc_len + data->iou_bulk_size);
+               if (!udsp) {
+                       /* positive, like every other l_errno value */
+                       l_errno = EFAULT;
+                       break;
+               }
+
+               exist = true;
+
+               /* create the tree to be printed; a failure to create
+                * a node is reported as out of memory
+                */
+               rc = LUSTRE_CFG_RC_OUT_OF_MEM;
+               item = cYAML_create_seq_item(udsp_node);
+               if (item == NULL)
+                       goto out;
+
+               if (!first_seq)
+                       first_seq = item;
+
+               if (cYAML_create_number(item, "idx",
+                                       udsp->udsp_idx) == NULL)
+                       goto out;
+
+               rc = yaml_add_udsp_descr(item, "src", &udsp->udsp_src);
+               if (rc)
+                       goto out;
+
+               rc = yaml_add_udsp_descr(item, "dst", &udsp->udsp_dst);
+               if (rc)
+                       goto out;
+
+               rc = yaml_add_udsp_descr(item, "rte", &udsp->udsp_rte);
+               if (rc)
+                       goto out;
+
+               rc = yaml_add_udsp_action(item, udsp);
+               if (rc)
+                       goto out;
+
+               /* reset the pointers so the cleanup at "out" cannot
+                * free them a second time
+                */
+               free(ioctl_buf);
+               ioctl_buf = NULL;
+               lnet_udsp_free(udsp, true);
+               udsp = NULL;
+
+               /* did we show the given index? */
+               if (idx != -1)
+                       break;
+       }
+
+       /* Print out the net information only if show_rc is not provided */
+       if (show_rc == NULL)
+               cYAML_print_tree(root);
+
+       /* ENOENT ends the walk (or means idx does not exist); 0 means
+        * the single requested policy was shown
+        */
+       if (l_errno != 0 && l_errno != ENOENT) {
+               snprintf(err_str,
+                        sizeof(err_str),
+                        "\"cannot get udsp: %s\"",
+                        strerror(l_errno));
+               rc = -l_errno;
+               goto out;
+       }
+
+       rc = LUSTRE_CFG_RC_NO_ERR;
+       snprintf(err_str, sizeof(err_str), "\"success\"");
+out:
+       free(ioctl_buf);
+       if (udsp)
+               lnet_udsp_free(udsp, true);
+
+       if (show_rc == NULL || rc != LUSTRE_CFG_RC_NO_ERR || !exist) {
+               cYAML_free_tree(root);
+       } else if (show_rc != NULL && *show_rc != NULL) {
+               struct cYAML *show_node;
+               /* find the udsp node, if one doesn't exist
+                * then insert one.  Otherwise add to the one there
+                */
+               show_node = cYAML_get_object_item(*show_rc, "udsp");
+               if (show_node != NULL && cYAML_is_sequence(show_node)) {
+                       cYAML_insert_child(show_node, first_seq);
+                       free(udsp_node);
+                       free(root);
+               } else if (show_node == NULL) {
+                       cYAML_insert_sibling((*show_rc)->cy_child,
+                                               udsp_node);
+                       free(root);
+               } else {
+                       cYAML_free_tree(root);
+               }
+       } else {
+               *show_rc = root;
+       }
+
+       cYAML_build_error(rc, seq_no, SHOW_CMD, "udsp", err_str, err_rc);
+
+       return rc;
+}
+
index 2328d00..13d9d19 100644 (file)
@@ -50,6 +50,7 @@ static int jt_show_stats(int argc, char **argv);
 static int jt_show_peer(int argc, char **argv);
 static int jt_show_recovery(int argc, char **argv);
 static int jt_show_global(int argc, char **argv);
+static int jt_show_udsp(int argc, char **argv);
 static int jt_set_tiny(int argc, char **argv);
 static int jt_set_small(int argc, char **argv);
 static int jt_set_large(int argc, char **argv);
@@ -65,6 +66,8 @@ static int jt_set_max_intf(int argc, char **argv);
 static int jt_set_discovery(int argc, char **argv);
 static int jt_set_drop_asym_route(int argc, char **argv);
 static int jt_list_peer(int argc, char **argv);
+static int jt_add_udsp(int argc, char **argv);
+static int jt_del_udsp(int argc, char **argv);
 /*static int jt_show_peer(int argc, char **argv);*/
 static int lnetctl_list_commands(int argc, char **argv);
 static int jt_import(int argc, char **argv);
@@ -85,6 +88,7 @@ static int jt_set_peer_ni_value(int argc, char **argv);
 static int jt_calc_service_id(int argc, char **argv);
 static int jt_set_response_tracking(int argc, char **argv);
 static int jt_set_recovery_limit(int argc, char **argv);
+static int jt_udsp(int argc, char **argv);
 
 command_t cmd_list[] = {
        {"lnet", jt_lnet, 0, "lnet {configure | unconfigure} [--all]"},
@@ -106,6 +110,7 @@ command_t cmd_list[] = {
        {"ping", jt_ping, 0, "ping nid,[nid,...]"},
        {"discover", jt_discover, 0, "discover nid[,nid,...]"},
        {"service-id", jt_calc_service_id, 0, "Calculate IB Lustre service ID\n"},
+       {"udsp", jt_udsp, 0, "udsp {add | del | help}"},
        {"help", Parser_help, 0, "help"},
        {"exit", Parser_quit, 0, "quit"},
        {"quit", Parser_quit, 0, "quit"},
@@ -259,6 +264,21 @@ command_t peer_cmds[] = {
        { 0, 0, 0, NULL }
 };
 
+/*
+ * Sub-command table for "lnetctl udsp": each entry pairs a sub-command
+ * name with its handler and its help text; a zeroed entry ends the table.
+ * jt_udsp() dispatches through it, and the handlers pass it to
+ * check_cmd() and print_help().
+ */
+command_t udsp_cmds[] = {
+       {"add", jt_add_udsp, 0, "add a udsp\n"
+        "\t--src: ip2nets syntax specifying the local NID to match\n"
+        "\t--dst: ip2nets syntax specifying the remote NID to match\n"
+        "\t--rte: ip2nets syntax specifying the router NID to match\n"
+        "\t--priority: priority value (0 - highest priority)\n"
+        "\t--idx: index of where to insert the rule.\n"
+        "\t       By default, appends to the end of the rule list.\n"},
+       {"del", jt_del_udsp, 0, "delete a udsp\n"
+       "\t--idx: index of the Policy.\n"},
+       {"show", jt_show_udsp, 0, "show udsps\n"
+        "\t --idx: index of the policy to show.\n"},
+       { 0, 0, 0, NULL }
+};
+
 static int jt_calc_service_id(int argc, char **argv)
 {
        int rc;
@@ -1380,6 +1400,48 @@ static int jt_show_stats(int argc, char **argv)
        return rc;
 }
 
+/*
+ * jt_show_udsp - handler for "lnetctl udsp show"
+ *
+ * Accepts an optional --idx/-i argument (default -1) and passes it to
+ * lustre_lnet_show_udsp().  On success the returned show tree is printed
+ * with cYAML_print_tree(); on failure the error tree is written to stderr.
+ * Both trees are freed before returning.
+ *
+ * Returns the check_cmd() result when it is non-zero, 0 when option
+ * parsing stops on an unrecognised option (after printing the help of
+ * "udsp show"), otherwise the return code of lustre_lnet_show_udsp().
+ */
+static int jt_show_udsp(int argc, char **argv)
+{
+       int idx = -1;
+       int rc, opt;
+       struct cYAML *err_rc = NULL, *show_rc = NULL;
+
+       const char *const short_options = "i:";
+       static const struct option long_options[] = {
+               { .name = "idx", .has_arg = required_argument, .val = 'i' },
+               { .name = NULL }
+       };
+
+       rc = check_cmd(udsp_cmds, "udsp", "show", 0, argc, argv);
+       if (rc)
+               return rc;
+
+       while ((opt = getopt_long(argc, argv, short_options,
+                                  long_options, NULL)) != -1) {
+               switch (opt) {
+               case 'i':
+                       idx = atoi(optarg);
+                       break;
+               case '?':
+                       /* show the usage of this command, i.e. "udsp show" */
+                       print_help(udsp_cmds, "udsp", "show");
+                       /* fallthrough */
+               default:
+                       return 0;
+               }
+       }
+
+       rc = lustre_lnet_show_udsp(idx, -1, &show_rc, &err_rc);
+
+       if (rc != LUSTRE_CFG_RC_NO_ERR)
+               cYAML_print_tree2file(stderr, err_rc);
+       else if (show_rc)
+               cYAML_print_tree(show_rc);
+
+       cYAML_free_tree(err_rc);
+       cYAML_free_tree(show_rc);
+
+       return rc;
+}
+
 static int jt_show_global(int argc, char **argv)
 {
        int rc;
@@ -1570,6 +1632,17 @@ static int jt_set(int argc, char **argv)
        return Parser_execarg(argc - 1, &argv[1], set_cmds);
 }
 
+/*
+ * jt_udsp - entry point of the top-level "udsp" command.
+ *
+ * Validates the invocation with check_cmd() against udsp_cmds and, when
+ * that succeeds, dispatches through Parser_execarg() using the same table.
+ * Returns the non-zero check_cmd() result, or whatever Parser_execarg()
+ * returns.
+ */
+static int jt_udsp(int argc, char **argv)
+{
+       int err = check_cmd(udsp_cmds, "udsp", NULL, 2, argc, argv);
+
+       if (err != 0)
+               return err;
+
+       /* skip the first argument (presumably the "udsp" keyword itself) */
+       return Parser_execarg(argc - 1, argv + 1, udsp_cmds);
+}
+
 static int jt_import(int argc, char **argv)
 {
        char *file = NULL;
@@ -1804,6 +1877,13 @@ static int jt_export(int argc, char **argv)
                err_rc = NULL;
        }
 
+       rc = lustre_lnet_show_udsp(-1, -1, &show_rc, &err_rc);
+       if (rc != LUSTRE_CFG_RC_NO_ERR) {
+               cYAML_print_tree2file(stderr, err_rc);
+               cYAML_free_tree(err_rc);
+               err_rc = NULL;
+       }
+
        if (show_rc != NULL) {
                cYAML_print_tree2file(f, show_rc);
                cYAML_free_tree(show_rc);
@@ -2054,6 +2134,109 @@ static int jt_discover(int argc, char **argv)
        return rc;
 }
 
+/*
+ * jt_add_udsp - handler for "lnetctl udsp add"
+ *
+ * Options:
+ *   --src/-s, --dst/-d, --rte/-r  match expressions, passed through as given
+ *   --priority/-p                 switches the action type from the default
+ *                                 "pref" to "priority"; a value rejected by
+ *                                 parse_long() is passed on as -1
+ *   --idx/-i                      insertion index (default -1); a value
+ *                                 rejected by parse_long() is passed on as 0
+ *
+ * The collected values are handed to lustre_lnet_add_udsp(); on failure
+ * the error tree is written to stderr.
+ *
+ * Returns the check_cmd() result when it is non-zero, 0 when option
+ * parsing stops on an unrecognised option (after printing the help of
+ * "udsp add"), otherwise the return code of lustre_lnet_add_udsp().
+ */
+static int jt_add_udsp(int argc, char **argv)
+{
+       char *src = NULL, *dst = NULL, *rte = NULL;
+       struct cYAML *err_rc = NULL;
+       /*
+        * Zero-initialised so the library never sees indeterminate bytes
+        * when --priority is not supplied.
+        */
+       union lnet_udsp_action udsp_action = { 0 };
+       long int idx = -1, priority = -1;
+       int opt, rc = 0;
+       char *action_type = "pref";
+
+       const char *const short_options = "s:d:r:p:i:";
+       static const struct option long_options[] = {
+       { .name = "src",         .has_arg = required_argument, .val = 's' },
+       { .name = "dst",         .has_arg = required_argument, .val = 'd' },
+       { .name = "rte",         .has_arg = required_argument, .val = 'r' },
+       { .name = "priority",    .has_arg = required_argument, .val = 'p' },
+       { .name = "idx",         .has_arg = required_argument, .val = 'i' },
+       { .name = NULL } };
+
+       rc = check_cmd(udsp_cmds, "udsp", "add", 0, argc, argv);
+       if (rc)
+               return rc;
+
+       while ((opt = getopt_long(argc, argv, short_options,
+                                 long_options, NULL)) != -1) {
+               switch (opt) {
+               case 's':
+                       src = optarg;
+                       break;
+               case 'd':
+                       dst = optarg;
+                       break;
+               case 'r':
+                       rte = optarg;
+                       break;
+               case 'p':
+                       rc = parse_long(optarg, &priority);
+                       if (rc != 0)
+                               priority = -1;
+                       action_type = "priority";
+                       udsp_action.udsp_priority = priority;
+                       break;
+               case 'i':
+                       rc = parse_long(optarg, &idx);
+                       if (rc != 0)
+                               idx = 0;
+                       break;
+               case '?':
+                       print_help(udsp_cmds, "udsp", "add");
+                       /* fallthrough */
+               default:
+                       return 0;
+               }
+       }
+
+       rc = lustre_lnet_add_udsp(src, dst, rte, action_type, &udsp_action,
+                                 idx, -1, &err_rc);
+
+       if (rc != LUSTRE_CFG_RC_NO_ERR)
+               cYAML_print_tree2file(stderr, err_rc);
+
+       cYAML_free_tree(err_rc);
+
+       return rc;
+}
+
+/*
+ * jt_del_udsp - handler for "lnetctl udsp del"
+ *
+ * Accepts --idx/-i, the index handed to lustre_lnet_del_udsp() (default 0;
+ * a value rejected by parse_long() is also passed on as 0).  On failure
+ * the error tree is written to stderr.
+ *
+ * Returns the check_cmd() result when it is non-zero, 0 when option
+ * parsing stops on an unrecognised option (after printing the help of
+ * "udsp del"), otherwise the return code of lustre_lnet_del_udsp().
+ */
+static int jt_del_udsp(int argc, char **argv)
+{
+       struct cYAML *err_rc = NULL;
+       long int idx = 0;
+       int opt, rc = 0;
+
+       const char *const short_options = "i:";
+       static const struct option long_options[] = {
+       { .name = "idx",        .has_arg = required_argument, .val = 'i' },
+       { .name = NULL } };
+
+       rc = check_cmd(udsp_cmds, "udsp", "del", 0, argc, argv);
+       if (rc)
+               return rc;
+
+       while ((opt = getopt_long(argc, argv, short_options,
+                                 long_options, NULL)) != -1) {
+               switch (opt) {
+               case 'i':
+                       rc = parse_long(optarg, &idx);
+                       if (rc != 0)
+                               idx = 0;
+                       break;
+               case '?':
+                       /* show the usage of this command, i.e. "udsp del" */
+                       print_help(udsp_cmds, "udsp", "del");
+                       /* fallthrough */
+               default:
+                       return 0;
+               }
+       }
+
+       rc = lustre_lnet_del_udsp(idx, -1, &err_rc);
+       if (rc != LUSTRE_CFG_RC_NO_ERR)
+               cYAML_print_tree2file(stderr, err_rc);
+
+       cYAML_free_tree(err_rc);
+
+       return rc;
+}
+
 static int lnetctl_list_commands(int argc, char **argv)
 {
        char buffer[81] = ""; /* 80 printable chars + terminating NUL */
index 6e04865..f2a2793 100644 (file)
@@ -385,6 +385,117 @@ Show details on configured peer credits
 .br
 \-> Minimum router credits\.
 .
+.SS "UDSP Configuration"
+.
+.TP
+\fBlnetctl udsp\fR add
+Add user-defined selection policy.
+.
+.br
+.
+.TP
+Adding a local network udsp.
+.
+.br
+If multiple local networks are available, each one can be assigned a priority\.
+The one with the highest priority is selected to send on\.
+NID and network matching uses NID-range syntax; please see the manual for more detail\.
+.
+.br
+\-\-src : network in NID-range syntax (e.g. tcp0 or tcp[1-3])
+.
+.br
+\-\-priority <priority value>: optional priority value in [0-255], 0 as the highest
+.
+.br
+\-\-idx <index>: The index at which to insert the rule\. By default the rule is appended to the end of the list
+.
+.br
+.
+.TP
+Adding a local NID udsp.
+.
+.br
+Assign priority to local NIDs\. After a local network is chosen, the NI with highest priority is selected\.
+.
+.br
+\-\-src: NID in NID-range syntax (e.g. 10.1.1.2@tcp or 10.1.1.*@tcp)
+.
+.br
+\-\-priority <priority value>: optional priority value in [0-255], 0 as the highest
+.
+.br
+\-\-idx <index>: The index at which to insert the rule\. By default the rule is appended to the end of the list
+.
+.br
+.
+.TP
+Adding a peer NID udsp.
+.
+.br
+Assign priority to peer NIDs. Peer NID with highest priority is selected to send to\.
+.
+.br
+\-\-dst: NID in NID-range syntax (e.g. 10.1.1.2@tcp)
+.
+.br
+\-\-priority <priority value>: optional priority value in [0-255], 0 as the highest
+.
+.br
+\-\-idx <index>: The index at which to insert the rule\. By default the rule is appended to the end of the list
+.
+.br
+.
+.TP
+Adding a NID pair udsp.
+.
+.br
+The local NIDs which match the rule are added on a list on the peer NIs matching the rule\.
+When selecting the peer NI, the one with the local NID being used on its list is preferred\.
+.
+.br
+\-\-dst: NID in NID-range syntax (e.g. 10.1.1.1@tcp)
+.
+.br
+\-\-src: NID in NID-range syntax (e.g. 10.1.1.2@tcp)
+.
+.br
+\-\-idx <index>: The index at which to insert the rule\. By default the rule is appended to the end of the list
+.
+.br
+.
+.TP
+Adding a Peer Router udsp.
+.
+.br
+The router NIDs matching the rule are added on a list on the peer NIs matching the rule\.
+When sending to a remote peer, the router which has its nid on the peer NI list is preferred\.
+.
+.br
+\-\-dst: peer NID in NID-range syntax (e.g. 10.1.1.1@tcp)
+.
+.br
+\-\-rte: router NID in NID-range syntax (e.g. 10.1.2.1@tcp)
+.
+.br
+\-\-idx <index>: The index at which to insert the rule\. By default the rule is appended to the end of the list
+.
+.br
+.
+.TP
+\fBlnetctl udsp\fR del
+Delete user-defined selection policy.
+.
+.br
+\-\-idx: The index of the rule to delete\.
+.
+.br
+.TP
+\fBlnetctl udsp\fR show
+Show all user-defined selection policies in the system\. The policies are dumped in YAML form\.
+.
+.br
+\-\-idx <index>: The index of the policy to show\. If omitted, all policies are shown\.
+.
+.br
+.
 .SH "OPTIONS"
 .TP
 .B --list-commands
@@ -716,6 +827,53 @@ peer:
           state: NA
 .
 .br
+.
+.SS "Adding a UDSP"
+.
+.IP "\(bu" 4
+lnetctl udsp add \-\-src tcp \-\-priority 1
+.
+.IP "" 0
+.
+.P
+.
+.SS "Deleting a UDSP"
+.
+.IP "\(bu" 4
+lnetctl udsp del \-\-idx 0
+.
+.IP "" 0
+.
+.P
+.SS "Show UDSPs"
+.
+.IP "\(bu" 4
+lnetctl udsp show
+.
+.IP "" 0
+.
+.P
+udsp:
+.
+.br
+    \- idx: 0
+.
+.br
+      src: tcp
+.
+.br
+      dst: NA
+.
+.br
+      rte: NA
+.
+.br
+      action:
+.
+.br
+          priority: 0
+.
+.br
 
 .SH SEE ALSO
 .BR lustre (7)