Whamcloud - gitweb
LU-7734 lnet: Multi-Rail local_ni/peer_ni selection
authorAmir Shehata <amir.shehata@intel.com>
Tue, 5 Jan 2016 00:02:25 +0000 (16:02 -0800)
committerAmir Shehata <amir.shehata@intel.com>
Wed, 25 Jan 2017 02:38:17 +0000 (18:38 -0800)
This patch implements the local_ni/peer_ni selection algorithm.
It adds APIs to the peer module to encapsulate
iterating through the peer_nis in a peer and creating a peer.

Signed-off-by: Amir Shehata <amir.shehata@intel.com>
Change-Id: Ifc0e5ebf84ab25753adfcfcb433b024100f35ace
Reviewed-on: http://review.whamcloud.com/18383
Reviewed-by: Doug Oucharek <doug.s.oucharek@intel.com>
Reviewed-by: Olaf Weber <olaf@sgi.com>
Tested-by: Jenkins
Tested-by: Doug Oucharek <doug.s.oucharek@intel.com>
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-types.h
lnet/lnet/api-ni.c
lnet/lnet/lib-move.c
lnet/lnet/peer.c

index 0d3fc99..eb77d0c 100644 (file)
@@ -496,6 +496,7 @@ extern lnet_ni_t *lnet_nid2ni_addref(lnet_nid_t nid);
 extern lnet_ni_t *lnet_net2ni_locked(__u32 net, int cpt);
 extern lnet_ni_t *lnet_net2ni(__u32 net);
 bool lnet_is_ni_healthy_locked(struct lnet_ni *ni);
+struct lnet_net *lnet_get_net_locked(__u32 net_id);
 
 int lnet_lib_init(void);
 void lnet_lib_exit(void);
@@ -789,13 +790,24 @@ int lnet_parse_networks(struct list_head *nilist, char *networks,
 bool lnet_net_unique(__u32 net_id, struct list_head *nilist,
                     struct lnet_net **net);
 bool lnet_ni_unique_net(struct list_head *nilist, char *iface);
-
+void lnet_incr_dlc_seq(void);
+__u32 lnet_get_dlc_seq_locked(void);
+
+struct lnet_peer_ni *lnet_get_next_peer_ni_locked(struct lnet_peer *peer,
+                                                 struct lnet_peer_net *peer_net,
+                                                 struct lnet_peer_ni *prev);
+int lnet_find_or_create_peer_locked(lnet_nid_t dst_nid, int cpt,
+                                   struct lnet_peer **peer);
 int lnet_nid2peerni_locked(struct lnet_peer_ni **lpp, lnet_nid_t nid, int cpt);
 struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid, int cpt);
 void lnet_peer_tables_cleanup(lnet_ni_t *ni);
 void lnet_peer_tables_destroy(void);
 int lnet_peer_tables_create(void);
 void lnet_debug_peer(lnet_nid_t nid);
+struct lnet_peer_net *lnet_peer_get_net_locked(struct lnet_peer *peer,
+                                              __u32 net_id);
+bool lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni,
+                                struct lnet_ni *ni);
 int lnet_get_peer_info(__u32 peer_index, __u64 *nid,
                       char alivness[LNET_MAX_STR_LEN],
                       __u32 *cpt_iter, __u32 *refcount,
@@ -803,6 +815,45 @@ int lnet_get_peer_info(__u32 peer_index, __u64 *nid,
                       __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis,
                       __u32 *peer_tx_qnob);
 
+static inline bool
+lnet_is_peer_ni_healthy_locked(struct lnet_peer_ni *lpni)
+{
+       return lpni->lpni_healthy;
+}
+
+static inline void
+lnet_set_peer_ni_health_locked(struct lnet_peer_ni *lpni, bool health)
+{
+       lpni->lpni_healthy = health;
+}
+
+static inline bool
+lnet_is_peer_net_healthy_locked(struct lnet_peer_net *peer_net)
+{
+       struct lnet_peer_ni *lpni;
+
+       list_for_each_entry(lpni, &peer_net->lpn_peer_nis,
+                           lpni_on_peer_net_list) {
+               if (lnet_is_peer_ni_healthy_locked(lpni))
+                       return true;
+       }
+
+       return false;
+}
+
+static inline bool
+lnet_is_peer_healthy_locked(struct lnet_peer *peer)
+{
+       struct lnet_peer_net *peer_net;
+
+       list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) {
+               if (lnet_is_peer_net_healthy_locked(peer_net))
+                       return true;
+       }
+
+       return false;
+}
+
 static inline void
 lnet_peer_set_alive(struct lnet_peer_ni *lp)
 {
index 357f67a..c073953 100644 (file)
@@ -379,6 +379,9 @@ typedef struct lnet_ni {
        /* lnd tunables set explicitly */
        bool ni_lnd_tunables_set;
 
+       /* sequence number used to round robin over nis within a net */
+       __u32                   ni_seq;
+
        /*
         * equivalent interfaces to use
         * This is an array because socklnd bonding can still be configured
@@ -421,7 +424,6 @@ typedef struct {
 struct lnet_peer_ni {
        /* cahian on peer_net */
        struct list_head        lpni_on_peer_net_list;
-
        /* chain on peer hash */
        struct list_head        lpni_hashlist;
        /* messages blocking for tx credits */
@@ -474,10 +476,20 @@ struct lnet_peer_ni {
        int                     lpni_cpt;
        /* # refs from lnet_route_t::lr_gateway */
        int                     lpni_rtr_refcount;
+       /* sequence number used to round robin over peer nis within a net */
+       __u32                   lpni_seq;
+       /* health flag */
+       bool                    lpni_healthy;
        /* returned RC ping features */
        unsigned int            lpni_ping_feats;
-       struct list_head        lpni_routes;    /* routers on this peer */
-       lnet_rc_data_t          *lpni_rcd;      /* router checker state */
+       /* routes on this peer */
+       struct list_head        lpni_routes;
+       /* array of preferred local nids */
+       lnet_nid_t              *lpni_pref_nids;
+       /* number of preferred NIDs in lnpi_pref_nids */
+       __u32                   lpni_pref_nnids;
+       /* router checker state */
+       lnet_rc_data_t          *lpni_rcd;
 };
 
 struct lnet_peer {
@@ -489,6 +501,9 @@ struct lnet_peer {
 
        /* primary NID of the peer */
        lnet_nid_t              lp_primary_nid;
+
+       /* peer is Multi-Rail enabled peer */
+       bool                    lp_multi_rail;
 };
 
 struct lnet_peer_net {
@@ -503,6 +518,9 @@ struct lnet_peer_net {
 
        /* Net ID */
        __u32                   lpn_net_id;
+
+       /* health flag */
+       bool                    lpn_healthy;
 };
 
 /* peer hash size */
index 75116ad..8f5dc24 100644 (file)
@@ -62,6 +62,15 @@ module_param(use_tcp_bonding, int, 0444);
 MODULE_PARM_DESC(use_tcp_bonding,
                 "Set to 1 to use socklnd bonding. 0 to use Multi-Rail");
 
+/*
+ * This sequence number keeps track of how many times DLC was used to
+ * update the configuration. It is incremented on any DLC update and
+ * checked when sending a message to determine if there is a need to
+ * re-run the selection algorithm to handle configuration change.
+ * Look at lnet_select_pathway() for more details on its usage.
+ */
+static atomic_t lnet_dlc_seq_no = ATOMIC_INIT(0);
+
 static int lnet_ping(lnet_process_id_t id, signed long timeout,
                     lnet_process_id_t __user *ids, int n_ids);
 
@@ -1565,6 +1574,7 @@ lnet_startup_lndnet(struct lnet_net *net, struct lnet_lnd_tunables *tun)
 
        lnet_net_lock(LNET_LOCK_EX);
        list_splice_tail(&local_ni_list, &net_l->net_ni_list);
+       lnet_incr_dlc_seq();
        lnet_net_unlock(LNET_LOCK_EX);
 
        /* if the network is not unique then we don't want to keep
@@ -2243,6 +2253,16 @@ out:
        return rc;
 }
 
+void lnet_incr_dlc_seq(void)
+{
+       atomic_inc(&lnet_dlc_seq_no);
+}
+
+__u32 lnet_get_dlc_seq_locked(void)
+{
+       return atomic_read(&lnet_dlc_seq_no);
+}
+
 /**
  * LNet ioctl handler.
  *
index 904c7c2..769856c 100644 (file)
@@ -627,12 +627,11 @@ lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target,
        if (len != 0)
                lnet_setpayloadbuffer(msg);
 
-       memset(&msg->msg_hdr, 0, sizeof(msg->msg_hdr));
-       msg->msg_hdr.type           = cpu_to_le32(type);
-       msg->msg_hdr.dest_nid       = cpu_to_le64(target.nid);
-       msg->msg_hdr.dest_pid       = cpu_to_le32(target.pid);
+       memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr));
+       msg->msg_hdr.type           = cpu_to_le32(type);
+       msg->msg_hdr.dest_pid       = cpu_to_le32(target.pid);
        /* src_nid will be set later */
-       msg->msg_hdr.src_pid        = cpu_to_le32(the_lnet.ln_pid);
+       msg->msg_hdr.src_pid        = cpu_to_le32(the_lnet.ln_pid);
        msg->msg_hdr.payload_length = cpu_to_le32(len);
 }
 
@@ -1027,6 +1026,15 @@ lnet_return_tx_credits_locked(lnet_msg_t *msg)
        }
 
        if (txpeer != NULL) {
+               /*
+                * TODO:
+                * Once the patch for the health comes in we need to set
+                * the health of the peer ni to bad when we fail to send
+                * a message.
+                * int status = msg->msg_ev.status;
+                * if (status != 0)
+                *      lnet_set_peer_ni_health_locked(txpeer, false)
+                */
                msg->msg_txpeer = NULL;
                lnet_peer_ni_decref_locked(txpeer);
        }
@@ -1154,41 +1162,52 @@ routing_off:
 }
 
 static int
+lnet_compare_peers(struct lnet_peer_ni *p1, struct lnet_peer_ni *p2)
+{
+       if (p1->lpni_txqnob < p2->lpni_txqnob)
+               return 1;
+
+       if (p1->lpni_txqnob > p2->lpni_txqnob)
+               return -1;
+
+       if (p1->lpni_txcredits > p2->lpni_txcredits)
+               return 1;
+
+       if (p1->lpni_txcredits < p2->lpni_txcredits)
+               return -1;
+
+       return 0;
+}
+
+static int
 lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2)
 {
        struct lnet_peer_ni *p1 = r1->lr_gateway;
        struct lnet_peer_ni *p2 = r2->lr_gateway;
        int r1_hops = (r1->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r1->lr_hops;
        int r2_hops = (r2->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r2->lr_hops;
+       int rc;
 
        if (r1->lr_priority < r2->lr_priority)
                return 1;
 
        if (r1->lr_priority > r2->lr_priority)
-               return -ERANGE;
+               return -1;
 
        if (r1_hops < r2_hops)
                return 1;
 
        if (r1_hops > r2_hops)
-               return -ERANGE;
-
-       if (p1->lpni_txqnob < p2->lpni_txqnob)
-               return 1;
-
-       if (p1->lpni_txqnob > p2->lpni_txqnob)
-               return -ERANGE;
-
-       if (p1->lpni_txcredits > p2->lpni_txcredits)
-               return 1;
+               return -1;
 
-       if (p1->lpni_txcredits < p2->lpni_txcredits)
-               return -ERANGE;
+       rc = lnet_compare_peers(p1, p2);
+       if (rc)
+               return rc;
 
        if (r1->lr_seq - r2->lr_seq <= 0)
                return 1;
 
-       return -ERANGE;
+       return -1;
 }
 
 static struct lnet_peer_ni *
@@ -1250,166 +1269,426 @@ lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target,
        return lpni_best;
 }
 
-int
-lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid)
+static int
+lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
+                   struct lnet_msg *msg, lnet_nid_t rtr_nid, bool *lo_sent)
 {
-       lnet_nid_t              dst_nid = msg->msg_target.nid;
-       struct lnet_ni          *src_ni;
-       struct lnet_ni          *local_ni;
-       struct lnet_peer_ni     *lp;
-       int                     cpt;
-       int                     cpt2;
-       int                     rc;
-
-       /* NB: rtr_nid is set to LNET_NID_ANY for all current use-cases,
-        * but we might want to use pre-determined router for ACK/REPLY
-        * in the future */
-       /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */
-       LASSERT(msg->msg_txpeer == NULL);
-       LASSERT(!msg->msg_sending);
-       LASSERT(!msg->msg_target_is_router);
-       LASSERT(!msg->msg_receiving);
+       struct lnet_ni          *best_ni = NULL;
+       struct lnet_peer_ni     *best_lpni = NULL;
+       struct lnet_peer_ni     *net_gw = NULL;
+       struct lnet_peer_ni     *best_gw = NULL;
+       struct lnet_peer_ni     *lpni;
+       struct lnet_peer        *peer = NULL;
+       struct lnet_peer_net    *peer_net;
+       struct lnet_net         *local_net;
+       struct lnet_ni          *ni = NULL;
+       int                     cpt, cpt2, rc;
+       bool                    routing = false;
+       bool                    ni_is_pref = false;
+       bool                    preferred = false;
+       int                     best_credits = 0;
+       __u32                   seq, seq2;
+       int                     best_lpni_credits = INT_MIN;
+
+again:
+       /*
+        * get an initial CPT to use for locking. The idea here is not to
+        * serialize the calls to select_pathway, so that as many
+        * operations can run concurrently as possible. To do that we use
+        * the CPT where this call is being executed. Later on when we
+        * determine the CPT to use in lnet_message_commit, we switch the
+        * lock and check if there was any configuration changes, if none,
+        * then we proceed, if there is, then we'll need to update the cpt
+        * and redo the operation.
+        */
+       cpt = lnet_net_lock_current();
 
-       msg->msg_sending = 1;
+       best_gw = NULL;
+       routing = false;
+       local_net = NULL;
+       best_ni = NULL;
 
-       LASSERT(!msg->msg_tx_committed);
-       local_ni = lnet_net2ni(LNET_NIDNET(dst_nid));
-       cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid,
-                             local_ni);
- again:
-       if (the_lnet.ln_shutdown)
+       if (the_lnet.ln_shutdown) {
+               lnet_net_unlock(cpt);
                return -ESHUTDOWN;
-       lnet_net_lock(cpt);
+       }
 
-       if (src_nid == LNET_NID_ANY) {
-               src_ni = NULL;
-       } else {
-               src_ni = lnet_nid2ni_locked(src_nid, cpt);
-               if (src_ni == NULL) {
+       /*
+        * initialize the variables which could be reused if we go to
+        * again
+        */
+       lpni = NULL;
+       seq = lnet_get_dlc_seq_locked();
+
+       rc = lnet_find_or_create_peer_locked(dst_nid, cpt, &peer);
+       if (rc != 0) {
+               lnet_net_unlock(cpt);
+               return rc;
+       }
+
+       /* If peer is not healthy then can not send anything to it */
+       if (!lnet_is_peer_healthy_locked(peer)) {
+               lnet_net_unlock(cpt);
+               return -EHOSTUNREACH;
+       }
+
+       /*
+        * STEP 1: first jab at determineing best_ni
+        * if src_nid is explicitly specified, then best_ni is already
+        * pre-determiend for us. Otherwise we need to select the best
+        * one to use later on
+        */
+       if (src_nid != LNET_NID_ANY) {
+               best_ni = lnet_nid2ni_locked(src_nid, cpt);
+               if (!best_ni) {
                        lnet_net_unlock(cpt);
                        LCONSOLE_WARN("Can't send to %s: src %s is not a "
                                      "local nid\n", libcfs_nid2str(dst_nid),
                                      libcfs_nid2str(src_nid));
                        return -EINVAL;
                }
-               LASSERT(!msg->msg_routing);
-       }
-
-       /* Is this for someone on a local network? */
-       local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt);
 
-       if (local_ni != NULL) {
-               if (src_ni == NULL) {
-                       src_ni = local_ni;
-                       src_nid = src_ni->ni_nid;
-               } else if (src_ni != local_ni) {
+               if (best_ni->ni_net->net_id != LNET_NIDNET(dst_nid)) {
                        lnet_net_unlock(cpt);
                        LCONSOLE_WARN("No route to %s via from %s\n",
                                      libcfs_nid2str(dst_nid),
                                      libcfs_nid2str(src_nid));
                        return -EINVAL;
                }
+       }
 
-               LASSERT(src_nid != LNET_NID_ANY);
+       if (best_ni == the_lnet.ln_loni) {
+               /* No send credit hassles with LOLND */
+               msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid);
+               if (!msg->msg_routing)
+                       msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid);
+               msg->msg_target.nid = best_ni->ni_nid;
                lnet_msg_commit(msg, cpt);
 
-               if (!msg->msg_routing)
-                       msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+               lnet_ni_addref_locked(best_ni, cpt);
+               lnet_net_unlock(cpt);
+               msg->msg_txni = best_ni;
+               lnet_ni_send(best_ni, msg);
 
-               if (src_ni == the_lnet.ln_loni) {
-                       /* No send credit hassles with LOLND */
-                       lnet_net_unlock(cpt);
-                       lnet_ni_send(src_ni, msg);
-                       return 0;
-               }
+               *lo_sent = true;
+               return 0;
+       }
 
-               rc = lnet_nid2peerni_locked(&lp, dst_nid, cpt);
-               if (rc != 0) {
-                       lnet_net_unlock(cpt);
-                       LCONSOLE_WARN("Error %d finding peer %s\n", rc,
-                                     libcfs_nid2str(dst_nid));
-                       /* ENOMEM or shutting down */
-                       return rc;
-               }
-               LASSERT (lp->lpni_net == src_ni->ni_net);
-       } else {
-               /* sending to a remote network */
-               lp = lnet_find_route_locked(src_ni != NULL ?
-                                           src_ni->ni_net : NULL,
-                                           dst_nid, rtr_nid);
-               if (lp == NULL) {
-                       lnet_net_unlock(cpt);
+       if (best_ni)
+               goto pick_peer;
 
-                       LCONSOLE_WARN("No route to %s via %s "
-                                     "(all routers down)\n",
-                                     libcfs_id2str(msg->msg_target),
-                                     libcfs_nid2str(src_nid));
-                       return -EHOSTUNREACH;
+       /*
+        * Decide whether we need to route to peer_ni.
+        * Get the local net that I need to be on to be able to directly
+        * send to that peer.
+        *
+        * a. Find the peer which the dst_nid belongs to.
+        * b. Iterate through each of the peer_nets/nis to decide
+        * the best peer/local_ni pair to use
+        */
+       list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) {
+               if (!lnet_is_peer_net_healthy_locked(peer_net))
+                       continue;
+
+               local_net = lnet_get_net_locked(peer_net->lpn_net_id);
+               if (!local_net) {
+                       /*
+                        * go through each peer_ni on that peer_net and
+                        * determine the best possible gw to go through
+                        */
+                       list_for_each_entry(lpni, &peer_net->lpn_peer_nis,
+                                           lpni_on_peer_net_list) {
+                               net_gw = lnet_find_route_locked(NULL,
+                                                               lpni->lpni_nid,
+                                                               rtr_nid);
+
+                               /*
+                                * if no route is found for that network then
+                                * move onto the next peer_ni in the peer
+                                */
+                               if (!net_gw)
+                                       continue;
+
+                               if (!best_gw) {
+                                       best_gw = net_gw;
+                                       best_lpni = lpni;
+                               } else  {
+                                       rc = lnet_compare_peers(net_gw,
+                                                               best_gw);
+                                       if (rc > 0) {
+                                               best_gw = net_gw;
+                                               best_lpni = lpni;
+                                       }
+                               }
+                       }
+
+                       if (!best_gw)
+                               continue;
+
+                       local_net = lnet_get_net_locked
+                                       (LNET_NIDNET(best_gw->lpni_nid));
+                       routing = true;
+               } else {
+                       routing = false;
+                       best_gw = NULL;
                }
 
-               /* rtr_nid is LNET_NID_ANY or NID of pre-determined router,
-                * it's possible that rtr_nid isn't LNET_NID_ANY and lp isn't
-                * pre-determined router, this can happen if router table
-                * was changed when we release the lock */
-               if (rtr_nid != lp->lpni_nid) {
-                       cpt2 = lp->lpni_cpt;
-                       if (cpt2 != cpt) {
-                               lnet_net_unlock(cpt);
-
-                               rtr_nid = lp->lpni_nid;
-                               cpt = cpt2;
-                               goto again;
+               /* no routable net found go on to a different net */
+               if (!local_net)
+                       continue;
+
+               /*
+                * Second jab at determining best_ni
+                * if we get here then the peer we're trying to send
+                * to is on a directly connected network, and we'll
+                * need to pick the local_ni on that network to send
+                * from
+                */
+               while ((ni = lnet_get_next_ni_locked(local_net, ni))) {
+                       if (!lnet_is_ni_healthy_locked(ni))
+                               continue;
+                       /* TODO: compare NUMA distance */
+                       if (ni->ni_tx_queues[cpt]->tq_credits <=
+                           best_credits) {
+                               /*
+                                * all we want is to read tq_credits
+                                * value as an approximation of how
+                                * busy the NI is. No need to grab a lock
+                                */
+                               continue;
+                       } else if (best_ni) {
+                               if ((best_ni)->ni_seq - ni->ni_seq <= 0)
+                                       continue;
+                               (best_ni)->ni_seq = ni->ni_seq + 1;
                        }
+
+                       best_ni = ni;
+                       best_credits = ni->ni_tx_queues[cpt]->tq_credits;
                }
+       }
 
-               CDEBUG(D_NET, "Best route to %s via %s for %s %d\n",
-                      libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lpni_nid),
-                      lnet_msgtyp2str(msg->msg_type), msg->msg_len);
+       if (!best_ni) {
+               lnet_net_unlock(cpt);
+               LCONSOLE_WARN("No local ni found to send from to %s\n",
+                       libcfs_nid2str(dst_nid));
+               return -EINVAL;
+       }
 
-               if (src_ni == NULL) {
-                       src_ni = lnet_get_next_ni_locked(lp->lpni_net, NULL);
-                       LASSERT(src_ni != NULL);
-                       src_nid = src_ni->ni_nid;
+       if (routing)
+               goto send;
+
+pick_peer:
+       lpni = NULL;
+
+       if (msg->msg_type == LNET_MSG_REPLY ||
+           msg->msg_type == LNET_MSG_ACK) {
+               /*
+                * for replies we want to respond on the same peer_ni we
+                * received the message on if possible. If not, then pick
+                * a peer_ni to send to
+                */
+               best_lpni = lnet_find_peer_ni_locked(dst_nid, cpt);
+               if (best_lpni) {
+                       lnet_peer_ni_decref_locked(best_lpni);
+                       goto send;
                } else {
-                       LASSERT (src_ni->ni_net == lp->lpni_net);
+                       CDEBUG(D_NET, "unable to send msg_type %d to "
+                             "originating %s\n", msg->msg_type,
+                             libcfs_nid2str(dst_nid));
                }
+       }
 
-               lnet_peer_ni_addref_locked(lp);
+       peer_net = lnet_peer_get_net_locked(peer,
+                                           best_ni->ni_net->net_id);
+       /*
+        * peer_net is not available or the src_nid is explicitly defined
+        * and the peer_net for that src_nid is unhealthy. find a route to
+        * the destination nid.
+        */
+       if (!peer_net ||
+           (src_nid != LNET_NID_ANY &&
+            !lnet_is_peer_net_healthy_locked(peer_net))) {
+               best_gw = lnet_find_route_locked(best_ni->ni_net,
+                                                dst_nid,
+                                                rtr_nid);
+               /*
+                * if no route is found for that network then
+                * move onto the next peer_ni in the peer
+                */
+               if (!best_gw) {
+                       lnet_net_unlock(cpt);
+                       LCONSOLE_WARN("No route to peer from %s\n",
+                               libcfs_nid2str(best_ni->ni_nid));
+                       return -EHOSTUNREACH;
+               }
 
-               LASSERT(src_nid != LNET_NID_ANY);
-               lnet_msg_commit(msg, cpt);
+               CDEBUG(D_NET, "Best route to %s via %s for %s %d\n",
+                       libcfs_nid2str(lpni->lpni_nid),
+                       libcfs_nid2str(best_gw->lpni_nid),
+                       lnet_msgtyp2str(msg->msg_type), msg->msg_len);
 
-               if (!msg->msg_routing) {
-                       /* I'm the source and now I know which NI to send on */
-                       msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+               best_lpni = lnet_find_peer_ni_locked(dst_nid, cpt);
+               LASSERT(best_lpni != NULL);
+               lnet_peer_ni_decref_locked(best_lpni);
+
+               routing = true;
+
+               goto send;
+       } else if (!lnet_is_peer_net_healthy_locked(peer_net)) {
+               /*
+                * this peer_net is unhealthy but we still have an opportunity
+                * to find another peer_net that we can use
+                */
+               __u32 net_id = peer_net->lpn_net_id;
+               lnet_net_unlock(cpt);
+               if (!best_lpni)
+                       LCONSOLE_WARN("peer net %s unhealthy\n",
+                                     libcfs_net2str(net_id));
+               goto again;
+       }
+
+       best_lpni = NULL;
+       while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) {
+               /*
+                * if this peer ni is not healty just skip it, no point in
+                * examining it further
+                */
+               if (!lnet_is_peer_ni_healthy_locked(lpni))
+                       continue;
+               ni_is_pref = lnet_peer_is_ni_pref_locked(lpni, best_ni);
+
+               if (!preferred && ni_is_pref) {
+                       preferred = true;
+               } else if (preferred && !ni_is_pref) {
+                       continue;
+               } if (lpni->lpni_txcredits <= best_lpni_credits)
+                       continue;
+               else if (best_lpni) {
+                       if (best_lpni->lpni_seq - lpni->lpni_seq <= 0)
+                               continue;
+                       best_lpni->lpni_seq = lpni->lpni_seq + 1;
                }
 
-               msg->msg_target_is_router = 1;
-               msg->msg_target.nid = lp->lpni_nid;
-               msg->msg_target.pid = LNET_PID_LUSTRE;
+               best_lpni = lpni;
+               best_lpni_credits = lpni->lpni_txcredits;
        }
 
-       /* 'lp' is our best choice of peer */
+       /* if we still can't find a peer ni then we can't reach it */
+       if (!best_lpni) {
+               __u32 net_id = peer_net->lpn_net_id;
+               lnet_net_unlock(cpt);
+               LCONSOLE_WARN("no peer_ni found on peer net %s\n",
+                               libcfs_net2str(net_id));
+               goto again;
+       }
 
-       LASSERT(!msg->msg_peertxcredit);
-       LASSERT(!msg->msg_txcredit);
-       LASSERT(msg->msg_txpeer == NULL);
+send:
+       /*
+        * determine the cpt to use and if it has changed then
+        * lock the new cpt and check if the config has changed.
+        * If it has changed then repeat the algorithm since the
+        * ni or peer list could have changed and the algorithm
+        * would endup picking a different ni/peer_ni pair.
+        */
+       cpt2 = best_lpni->lpni_cpt;
+       if (cpt != cpt2) {
+               lnet_net_unlock(cpt);
+               cpt = cpt2;
+               lnet_net_lock(cpt);
+               seq2 = lnet_get_dlc_seq_locked();
+               if (seq2 != seq) {
+                       lnet_net_unlock(cpt);
+                       goto again;
+               }
+       }
 
-       msg->msg_txpeer = lp;                   /* msg takes my ref on lp */
-       /* set the NI for this message */
-       msg->msg_txni = src_ni;
+       /*
+        * store the best_lpni in the message right away to avoid having
+        * to do the same operation under different conditions
+        */
+       msg->msg_txpeer = (routing) ? best_gw : best_lpni;
+       msg->msg_txni = best_ni;
+       /*
+        * grab a reference for the best_ni since now it's in use in this
+        * send. the reference will need to be dropped when the message is
+        * finished in lnet_finalize()
+        */
        lnet_ni_addref_locked(msg->msg_txni, cpt);
+       lnet_peer_ni_addref_locked(msg->msg_txpeer);
+
+       /*
+        * set the destination nid in the message here because it's
+        * possible that we'd be sending to a different nid than the one
+        * originaly given.
+        */
+       msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid);
+
+       /*
+        * Always set the target.nid to the best peer picked. Either the
+        * nid will be one of the preconfigured NIDs, or the same NID as
+        * what was originaly set in the target or it will be the NID of
+        * a router if this message should be routed
+        */
+       msg->msg_target.nid = msg->msg_txpeer->lpni_nid;
+
+       /*
+        * lnet_msg_commit assigns the correct cpt to the message, which
+        * is used to decrement the correct refcount on the ni when it's
+        * time to return the credits
+        */
+       lnet_msg_commit(msg, cpt);
+
+       /*
+        * If we are routing the message then we don't need to overwrite
+        * the src_nid since it would've been set at the origin. Otherwise
+        * we are the originator so we need to set it.
+        */
+       if (!msg->msg_routing)
+               msg->msg_hdr.src_nid = cpu_to_le64(msg->msg_txni->ni_nid);
+
+       if (routing) {
+               msg->msg_target_is_router = 1;
+               msg->msg_target.pid = LNET_PID_LUSTRE;
+       }
 
        rc = lnet_post_send_locked(msg, 0);
+
        lnet_net_unlock(cpt);
 
-       if (rc < 0)
+       return rc;
+}
+
+int
+lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid)
+{
+       lnet_nid_t              dst_nid = msg->msg_target.nid;
+       int                     rc;
+       bool                    lo_sent = false;
+
+       /*
+        * NB: rtr_nid is set to LNET_NID_ANY for all current use-cases,
+        * but we might want to use pre-determined router for ACK/REPLY
+        * in the future
+        */
+       /* NB: ni != NULL == interface pre-determined (ACK/REPLY) */
+       LASSERT (msg->msg_txpeer == NULL);
+       LASSERT (!msg->msg_sending);
+       LASSERT (!msg->msg_target_is_router);
+       LASSERT (!msg->msg_receiving);
+
+       msg->msg_sending = 1;
+
+       LASSERT(!msg->msg_tx_committed);
+
+       rc = lnet_select_pathway(src_nid, dst_nid, msg, rtr_nid, &lo_sent);
+       if (rc < 0 || lo_sent)
                return rc;
 
        if (rc == LNET_CREDIT_OK)
-               lnet_ni_send(src_ni, msg);
+               lnet_ni_send(msg->msg_txni, msg);
 
-       return 0; /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT */
+       /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT */
+       return 0;
 }
 
 void
index 6a6f56b..0276756 100644 (file)
@@ -224,6 +224,93 @@ lnet_find_peer_ni_locked(lnet_nid_t nid, int cpt)
        return lpni;
 }
 
+int
+lnet_find_or_create_peer_locked(lnet_nid_t dst_nid, int cpt, struct lnet_peer **peer)
+{
+       struct lnet_peer_ni *lpni;
+
+       lpni = lnet_find_peer_ni_locked(dst_nid, cpt);
+       if (!lpni) {
+               int rc;
+               rc = lnet_nid2peerni_locked(&lpni, dst_nid, cpt);
+               if (rc != 0)
+                       return rc;
+       }
+
+       *peer = lpni->lpni_peer_net->lpn_peer;
+       lnet_peer_ni_decref_locked(lpni);
+
+       return 0;
+}
+
+struct lnet_peer_ni *
+lnet_get_next_peer_ni_locked(struct lnet_peer *peer,
+                            struct lnet_peer_net *peer_net,
+                            struct lnet_peer_ni *prev)
+{
+       struct lnet_peer_ni *lpni;
+       struct lnet_peer_net *net = peer_net;
+
+       if (!prev) {
+               if (!net)
+                       net = list_entry(peer->lp_peer_nets.next,
+                                        struct lnet_peer_net,
+                                        lpn_on_peer_list);
+               lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni,
+                                 lpni_on_peer_net_list);
+
+               return lpni;
+       }
+
+       if (prev->lpni_on_peer_net_list.next ==
+           &prev->lpni_peer_net->lpn_peer_nis) {
+               /*
+                * if you reached the end of the peer ni list and the peer
+                * net is specified then there are no more peer nis in that
+                * net.
+                */
+               if (net)
+                       return NULL;
+
+               /*
+                * we reached the end of this net ni list. move to the
+                * next net
+                */
+               if (prev->lpni_peer_net->lpn_on_peer_list.next ==
+                   &peer->lp_peer_nets)
+                       /* no more nets and no more NIs. */
+                       return NULL;
+
+               /* get the next net */
+               net = list_entry(prev->lpni_peer_net->lpn_on_peer_list.next,
+                                struct lnet_peer_net,
+                                lpn_on_peer_list);
+               /* get the ni on it */
+               lpni = list_entry(net->lpn_peer_nis.next, struct lnet_peer_ni,
+                                 lpni_on_peer_net_list);
+
+               return lpni;
+       }
+
+       /* there are more nis left */
+       lpni = list_entry(prev->lpni_on_peer_net_list.next,
+                         struct lnet_peer_ni, lpni_on_peer_net_list);
+
+       return lpni;
+}
+
+bool
+lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, struct lnet_ni *ni)
+{
+       int i;
+
+       for (i = 0; i < lpni->lpni_pref_nnids; i++) {
+               if (lpni->lpni_pref_nids[i] == ni->ni_nid)
+                       return true;
+       }
+       return false;
+}
+
 static void
 lnet_try_destroy_peer_hierarchy_locked(struct lnet_peer_ni *lpni)
 {
@@ -294,6 +381,17 @@ lnet_build_peer_hierarchy(struct lnet_peer_ni *lpni)
        return 0;
 }
 
+struct lnet_peer_net *
+lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id)
+{
+       struct lnet_peer_net *peer_net;
+       list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_on_peer_list) {
+               if (peer_net->lpn_net_id == net_id)
+                       return peer_net;
+       }
+       return NULL;
+}
+
 void
 lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni)
 {
@@ -405,12 +503,19 @@ lnet_nid2peerni_locked(struct lnet_peer_ni **lpnip, lnet_nid_t nid, int cpt)
        }
 
        lpni->lpni_net = lnet_get_net_locked(LNET_NIDNET(lpni->lpni_nid));
-       lpni->lpni_txcredits    =
-       lpni->lpni_mintxcredits =
-               lpni->lpni_net->net_tunables.lct_peer_tx_credits;
-       lpni->lpni_rtrcredits    =
-       lpni->lpni_minrtrcredits =
-               lnet_peer_buffer_credits(lpni->lpni_net);
+       if (lpni->lpni_net) {
+               lpni->lpni_txcredits    =
+               lpni->lpni_mintxcredits =
+                       lpni->lpni_net->net_tunables.lct_peer_tx_credits;
+               lpni->lpni_rtrcredits    =
+               lpni->lpni_minrtrcredits =
+                       lnet_peer_buffer_credits(lpni->lpni_net);
+       } else {
+               CDEBUG(D_NET, "peer_ni %s is not directly connected\n",
+                      libcfs_nid2str(nid));
+       }
+
+       lnet_set_peer_ni_health_locked(lpni, true);
 
        list_add_tail(&lpni->lpni_hashlist,
                        &ptable->pt_hash[lnet_nid2peerhash(nid)]);