Whamcloud - gitweb
LU-9120 lnet: refactor lnet_select_pathway()
[fs/lustre-release.git] / lnet / lnet / lib-move.c
index e0014c2..383cc82 100644 (file)
@@ -46,6 +46,23 @@ static int local_nid_dist_zero = 1;
 module_param(local_nid_dist_zero, int, 0444);
 MODULE_PARM_DESC(local_nid_dist_zero, "Reserved");
 
+struct lnet_send_data { /* send state handed to the lnet send-path helpers */
+       struct lnet_ni *sd_best_ni; /* local NI to send from; stored in msg_txni */
+       struct lnet_peer_ni *sd_best_lpni; /* peer NI to send to; stored in msg_txpeer */
+       struct lnet_peer_ni *sd_final_dst_lpni; /* supplies hdr.dest_nid when routing */
+       struct lnet_peer *sd_peer; /* NOTE(review): presumably the destination peer - confirm */
+       struct lnet_peer *sd_gw_peer; /* NOTE(review): presumably the gateway peer - confirm */
+       struct lnet_peer_ni *sd_gw_lpni; /* NOTE(review): presumably the gateway peer NI - confirm */
+       struct lnet_peer_net *sd_peer_net; /* NOTE(review): presumably the peer net in use - confirm */
+       struct lnet_msg *sd_msg; /* message being sent */
+       lnet_nid_t sd_dst_nid; /* destination NID for this send */
+       lnet_nid_t sd_src_nid; /* source NID for this send */
+       lnet_nid_t sd_rtr_nid; /* NOTE(review): presumably the requested router NID - confirm */
+       int sd_cpt; /* CPT whose lnet_net_lock is held; updated on relock */
+       int sd_md_cpt; /* NOTE(review): presumably the CPT of the message's MD - confirm */
+       __u32 sd_send_case; /* bit flags for the send case, e.g. REMOTE_DST when routing */
+};
+
 static inline struct lnet_comm_count *
 get_stats_counts(struct lnet_element_stats *stats,
                 enum lnet_stats_type stats_type)
@@ -1369,7 +1386,7 @@ lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2)
 }
 
 static struct lnet_peer_ni *
-lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target,
+lnet_find_route_locked(struct lnet_net *net, __u32 remote_net,
                       lnet_nid_t rtr_nid)
 {
        struct lnet_remotenet   *rnet;
@@ -1383,7 +1400,7 @@ lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target,
        /* If @rtr_nid is not LNET_NID_ANY, return the gateway with
         * rtr_nid nid, otherwise find the best gateway I can use */
 
-       rnet = lnet_find_rnet_locked(LNET_NIDNET(target));
+       rnet = lnet_find_rnet_locked(remote_net);
        if (rnet == NULL)
                return NULL;
 
@@ -1428,13 +1445,21 @@ lnet_find_route_locked(struct lnet_net *net, lnet_nid_t target,
 }
 
 static struct lnet_ni *
-lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *cur_ni,
+lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
+                struct lnet_peer *peer, struct lnet_peer_net *peer_net,
                 int md_cpt)
 {
-       struct lnet_ni *ni = NULL, *best_ni = cur_ni;
+       struct lnet_ni *ni = NULL;
        unsigned int shortest_distance;
        int best_credits;
 
+       /*
+        * If there is no peer_ni that we can send to on this network,
+        * then there is no point in looking for a new best_ni here.
+       */
+       if (!lnet_get_next_peer_ni_locked(peer, peer_net, NULL))
+               return best_ni;
+
        if (best_ni == NULL) {
                shortest_distance = UINT_MAX;
                best_credits = INT_MIN;
@@ -1462,6 +1487,12 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *cur_ni,
                                            md_cpt,
                                            ni->ni_dev_cpt);
 
+               CDEBUG(D_NET, "compare ni %s [c:%d, d:%d, s:%d] with best_ni %s [c:%d, d:%d, s:%d]\n",
+                      libcfs_nid2str(ni->ni_nid), ni_credits, distance,
+                      ni->ni_seq, (best_ni) ? libcfs_nid2str(best_ni->ni_nid)
+                       : "not seleced", best_credits, shortest_distance,
+                       (best_ni) ? best_ni->ni_seq : 0);
+
                /*
                 * All distances smaller than the NUMA range
                 * are treated equally.
@@ -1487,6 +1518,9 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *cur_ni,
                best_credits = ni_credits;
        }
 
+       CDEBUG(D_NET, "selected best_ni %s\n",
+              (best_ni) ? libcfs_nid2str(best_ni->ni_nid) : "no selection");
+
        return best_ni;
 }
 
@@ -1511,422 +1545,150 @@ lnet_msg_discovery(struct lnet_msg *msg)
        return false;
 }
 
+#define SRC_SPEC       0x0001
+#define SRC_ANY                0x0002
+#define LOCAL_DST      0x0004
+#define REMOTE_DST     0x0008
+#define MR_DST         0x0010
+#define NMR_DST                0x0020
+#define SND_RESP       0x0040
+
+/* The following two defines are used for return codes */
+#define REPEAT_SEND    0x1000
+#define PASS_THROUGH   0x2000
+
+/* The different cases lnet_select_pathway() needs to handle */
+#define SRC_SPEC_LOCAL_MR_DST  (SRC_SPEC | LOCAL_DST | MR_DST)
+#define SRC_SPEC_ROUTER_MR_DST (SRC_SPEC | REMOTE_DST | MR_DST)
+#define SRC_SPEC_LOCAL_NMR_DST (SRC_SPEC | LOCAL_DST | NMR_DST)
+#define SRC_SPEC_ROUTER_NMR_DST        (SRC_SPEC | REMOTE_DST | NMR_DST)
+#define SRC_ANY_LOCAL_MR_DST   (SRC_ANY | LOCAL_DST | MR_DST)
+#define SRC_ANY_ROUTER_MR_DST  (SRC_ANY | REMOTE_DST | MR_DST)
+#define SRC_ANY_LOCAL_NMR_DST  (SRC_ANY | LOCAL_DST | NMR_DST)
+#define SRC_ANY_ROUTER_NMR_DST (SRC_ANY | REMOTE_DST | NMR_DST)
+
 static int
-lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
-                   struct lnet_msg *msg, lnet_nid_t rtr_nid)
+lnet_handle_send(struct lnet_send_data *sd)
 {
-       struct lnet_ni          *best_ni;
-       struct lnet_peer_ni     *best_lpni;
-       struct lnet_peer_ni     *best_gw;
-       struct lnet_peer_ni     *lpni;
-       struct lnet_peer_ni     *final_dst;
-       struct lnet_peer        *peer;
-       struct lnet_peer_net    *peer_net;
-       struct lnet_net         *local_net;
-       int                     cpt, cpt2, rc;
-       bool                    routing;
-       bool                    routing2;
-       bool                    ni_is_pref;
-       bool                    preferred;
-       bool                    local_found;
-       int                     best_lpni_credits;
-       int                     md_cpt;
-
-       /*
-        * get an initial CPT to use for locking. The idea here is not to
-        * serialize the calls to select_pathway, so that as many
-        * operations can run concurrently as possible. To do that we use
-        * the CPT where this call is being executed. Later on when we
-        * determine the CPT to use in lnet_message_commit, we switch the
-        * lock and check if there was any configuration change.  If none,
-        * then we proceed, if there is, then we restart the operation.
-        */
-       cpt = lnet_net_lock_current();
-
-       md_cpt = lnet_cpt_of_md(msg->msg_md, msg->msg_offset);
-       if (md_cpt == CFS_CPT_ANY)
-               md_cpt = cpt;
-
-again:
-       best_ni = NULL;
-       best_lpni = NULL;
-       best_gw = NULL;
-       final_dst = NULL;
-       local_net = NULL;
-       routing = false;
-       routing2 = false;
-       local_found = false;
+       struct lnet_ni *best_ni = sd->sd_best_ni;
+       struct lnet_peer_ni *best_lpni = sd->sd_best_lpni;
+       struct lnet_peer_ni *final_dst_lpni = sd->sd_final_dst_lpni;
+       struct lnet_msg *msg = sd->sd_msg;
+       int cpt2;
+       __u32 send_case = sd->sd_send_case;
+       int rc;
+       __u32 routing = send_case & REMOTE_DST;
 
        /*
-        * lnet_nid2peerni_locked() is the path that will find an
-        * existing peer_ni, or create one and mark it as having been
-        * created due to network traffic.
+        * Increment sequence number of the selected peer so that we
+        * pick the next one in Round Robin.
         */
-       lpni = lnet_nid2peerni_locked(dst_nid, LNET_NID_ANY, cpt);
-       if (IS_ERR(lpni)) {
-               lnet_net_unlock(cpt);
-               return PTR_ERR(lpni);
-       }
+       best_lpni->lpni_seq++;
 
        /*
-        * If we're being asked to send to the loopback interface, there
-        * is no need to go through any selection. We can just shortcut
-        * the entire process and send over lolnd
+        * grab a reference on the peer_ni so it sticks around even if
+        * we need to drop and relock the lnet_net_lock below.
         */
-       if (LNET_NETTYP(LNET_NIDNET(dst_nid)) == LOLND) {
-               lnet_peer_ni_decref_locked(lpni);
-               best_ni = the_lnet.ln_loni;
-               goto send;
-       }
+       lnet_peer_ni_addref_locked(best_lpni);
 
        /*
-        * Now that we have a peer_ni, check if we want to discover
-        * the peer. Traffic to the LNET_RESERVED_PORTAL should not
-        * trigger discovery.
+        * Use lnet_cpt_of_nid() to determine the CPT used to commit the
+        * message. This ensures that we get a CPT that is correct for
+        * the NI when the NI has been restricted to a subset of all CPTs.
+        * If the selected CPT differs from the one currently locked, we
+        * must unlock and relock the lnet_net_lock(), and then check whether
+        * the configuration has changed. We don't have a hold on the best_ni
+        * yet, and it may have vanished.
         */
-       peer = lpni->lpni_peer_net->lpn_peer;
-       if (lnet_msg_discovery(msg) && !lnet_peer_is_uptodate(peer)) {
-               rc = lnet_discover_peer_locked(lpni, cpt, false);
-               if (rc) {
-                       lnet_peer_ni_decref_locked(lpni);
-                       lnet_net_unlock(cpt);
-                       return rc;
+       cpt2 = lnet_cpt_of_nid_locked(best_lpni->lpni_nid, best_ni);
+       if (sd->sd_cpt != cpt2) {
+               __u32 seq = lnet_get_dlc_seq_locked();
+               lnet_net_unlock(sd->sd_cpt);
+               sd->sd_cpt = cpt2;
+               lnet_net_lock(sd->sd_cpt);
+               if (seq != lnet_get_dlc_seq_locked()) {
+                       lnet_peer_ni_decref_locked(best_lpni);
+                       return REPEAT_SEND;
                }
-               /* The peer may have changed. */
-               peer = lpni->lpni_peer_net->lpn_peer;
-               /* queue message and return */
-               msg->msg_src_nid_param = src_nid;
-               msg->msg_rtr_nid_param = rtr_nid;
-               msg->msg_sending = 0;
-               list_add_tail(&msg->msg_list, &peer->lp_dc_pendq);
-               CDEBUG(D_NET, "%s pending discovery\n",
-                      libcfs_nid2str(peer->lp_primary_nid));
-               lnet_peer_ni_decref_locked(lpni);
-               lnet_net_unlock(cpt);
-
-               return LNET_DC_WAIT;
-       }
-       lnet_peer_ni_decref_locked(lpni);
-
-       /* If peer is not healthy then can not send anything to it */
-       if (!lnet_is_peer_healthy_locked(peer)) {
-               lnet_net_unlock(cpt);
-               return -EHOSTUNREACH;
        }
 
        /*
-        * STEP 1: first jab at determining best_ni
-        * if src_nid is explicitly specified, then best_ni is already
-        * pre-determiend for us. Otherwise we need to select the best
-        * one to use later on
+        * store the best_lpni in the message right away to avoid having
+        * to do the same operation under different conditions
         */
-       if (src_nid != LNET_NID_ANY) {
-               best_ni = lnet_nid2ni_locked(src_nid, cpt);
-               if (!best_ni) {
-                       lnet_net_unlock(cpt);
-                       LCONSOLE_WARN("Can't send to %s: src %s is not a "
-                                     "local nid\n", libcfs_nid2str(dst_nid),
-                                     libcfs_nid2str(src_nid));
-                       return -EINVAL;
-               }
-       }
-
-       if (msg->msg_type == LNET_MSG_REPLY ||
-           msg->msg_type == LNET_MSG_ACK ||
-           !lnet_peer_is_multi_rail(peer) ||
-           best_ni) {
-               /*
-                * for replies we want to respond on the same peer_ni we
-                * received the message on if possible. If not, then pick
-                * a peer_ni to send to
-                *
-                * if the peer is non-multi-rail then you want to send to
-                * the dst_nid provided as well.
-                *
-                * If the best_ni has already been determined, IE the
-                * src_nid has been specified, then use the
-                * destination_nid provided as well, since we're
-                * continuing a series of related messages for the same
-                * RPC.
-                *
-                * It is expected to find the lpni using dst_nid, since we
-                * created it earlier.
-                */
-               best_lpni = lnet_find_peer_ni_locked(dst_nid);
-               if (best_lpni)
-                       lnet_peer_ni_decref_locked(best_lpni);
-
-               if (best_lpni && !lnet_get_net_locked(LNET_NIDNET(dst_nid))) {
-                       /*
-                        * this lpni is not on a local network so we need
-                        * to route this reply.
-                        */
-                       best_gw = lnet_find_route_locked(NULL,
-                                                        best_lpni->lpni_nid,
-                                                        rtr_nid);
-                       if (best_gw) {
-                               /*
-                               * RULE: Each node considers only the next-hop
-                               *
-                               * We're going to route the message, so change the peer to
-                               * the router.
-                               */
-                               LASSERT(best_gw->lpni_peer_net);
-                               LASSERT(best_gw->lpni_peer_net->lpn_peer);
-                               peer = best_gw->lpni_peer_net->lpn_peer;
-
-                               /*
-                               * if the router is not multi-rail then use the best_gw
-                               * found to send the message to
-                               */
-                               if (!lnet_peer_is_multi_rail(peer))
-                                       best_lpni = best_gw;
-                               else
-                                       best_lpni = NULL;
-
-                               routing = true;
-                       } else {
-                               best_lpni = NULL;
-                       }
-               } else if (!best_lpni) {
-                       lnet_net_unlock(cpt);
-                       CERROR("unable to send msg_type %d to "
-                             "originating %s. Destination NID not in DB\n",
-                             msg->msg_type, libcfs_nid2str(dst_nid));
-                       return -EINVAL;
-               }
-       }
+       msg->msg_txpeer = best_lpni;
+       msg->msg_txni = best_ni;
 
        /*
-        * We must use a consistent source address when sending to a
-        * non-MR peer. However, a non-MR peer can have multiple NIDs
-        * on multiple networks, and we may even need to talk to this
-        * peer on multiple networks -- certain types of
-        * load-balancing configuration do this.
-        *
-        * So we need to pick the NI the peer prefers for this
-        * particular network.
+        * grab a reference for the best_ni since now it's in use in this
+        * send. The reference will be dropped in lnet_finalize()
         */
-       if (!lnet_peer_is_multi_rail(peer)) {
-               if (!best_lpni) {
-                       lnet_net_unlock(cpt);
-                       CERROR("no route to %s\n",
-                              libcfs_nid2str(dst_nid));
-                       return -EHOSTUNREACH;
-               }
-
-               /* best ni is already set if src_nid was provided */
-               if (!best_ni) {
-                       /* Get the target peer_ni */
-                       peer_net = lnet_peer_get_net_locked(peer,
-                                       LNET_NIDNET(best_lpni->lpni_nid));
-                       LASSERT(peer_net != NULL);
-                       list_for_each_entry(lpni, &peer_net->lpn_peer_nis,
-                                           lpni_peer_nis) {
-                               if (lpni->lpni_pref_nnids == 0)
-                                       continue;
-                               LASSERT(lpni->lpni_pref_nnids == 1);
-                               best_ni = lnet_nid2ni_locked(
-                                               lpni->lpni_pref.nid, cpt);
-                               break;
-                       }
-               }
-               /* if best_ni is still not set just pick one */
-               if (!best_ni) {
-                       best_ni = lnet_net2ni_locked(
-                               best_lpni->lpni_net->net_id, cpt);
-                       /* If there is no best_ni we don't have a route */
-                       if (!best_ni) {
-                               CERROR("no path to %s from net %s\n",
-                                       libcfs_nid2str(best_lpni->lpni_nid),
-                                       libcfs_net2str(best_lpni->lpni_net->net_id));
-                               lnet_net_unlock(cpt);
-                               return -EHOSTUNREACH;
-                       }
-                       lpni = list_entry(peer_net->lpn_peer_nis.next,
-                                       struct lnet_peer_ni,
-                                       lpni_peer_nis);
-               }
-               /* Set preferred NI if necessary. */
-               if (lpni->lpni_pref_nnids == 0)
-                       lnet_peer_ni_set_non_mr_pref_nid(lpni, best_ni->ni_nid);
-       }
+       lnet_ni_addref_locked(msg->msg_txni, sd->sd_cpt);
 
        /*
-        * if we already found a best_ni because src_nid is specified and
-        * best_lpni because we are replying to a message then just send
-        * the message
+        * Always set the target.nid to the best peer picked. Either the
+        * NID will be one of the peer NIDs selected, or the same NID as
+        * what was originally set in the target or it will be the NID of
+        * a router if this message should be routed
         */
-       if (best_ni && best_lpni)
-               goto send;
+       msg->msg_target.nid = msg->msg_txpeer->lpni_nid;
 
        /*
-        * If we already found a best_ni because src_nid is specified then
-        * pick the peer then send the message
+        * lnet_msg_commit assigns the correct cpt to the message, which
+        * is used to decrement the correct refcount on the ni when it's
+        * time to return the credits
         */
-       if (best_ni)
-               goto pick_peer;
+       lnet_msg_commit(msg, sd->sd_cpt);
 
        /*
-        * pick the best_ni by going through all the possible networks of
-        * that peer and see which local NI is best suited to talk to that
-        * peer.
-        *
-        * Locally connected networks will always be preferred over
-        * a routed network. If there are only routed paths to the peer,
-        * then the best route is chosen. If all routes are equal then
-        * they are used in round robin.
+        * If we are routing the message then we keep the src_nid that was
+        * set by the originator. If we are not routing then we are the
+        * originator and set it here.
         */
-       list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) {
-               if (!lnet_is_peer_net_healthy_locked(peer_net))
-                       continue;
-
-               local_net = lnet_get_net_locked(peer_net->lpn_net_id);
-               if (!local_net && !routing && !local_found) {
-                       struct lnet_peer_ni *net_gw;
-
-                       lpni = list_entry(peer_net->lpn_peer_nis.next,
-                                         struct lnet_peer_ni,
-                                         lpni_peer_nis);
-
-                       net_gw = lnet_find_route_locked(NULL,
-                                                       lpni->lpni_nid,
-                                                       rtr_nid);
-                       if (!net_gw)
-                               continue;
-
-                       if (best_gw) {
-                               /*
-                                * lnet_find_route_locked() call
-                                * will return the best_Gw on the
-                                * lpni->lpni_nid network.
-                                * However, best_gw and net_gw can
-                                * be on different networks.
-                                * Therefore need to compare them
-                                * to pick the better of either.
-                                */
-                               if (lnet_compare_peers(best_gw, net_gw) > 0)
-                                       continue;
-                               if (best_gw->lpni_gw_seq <= net_gw->lpni_gw_seq)
-                                       continue;
-                       }
-                       best_gw = net_gw;
-                       final_dst = lpni;
-
-                       routing2 = true;
-               } else {
-                       best_gw = NULL;
-                       final_dst = NULL;
-                       routing2 = false;
-                       local_found = true;
-               }
-
-               /*
-                * a gw on this network is found, but there could be
-                * other better gateways on other networks. So don't pick
-                * the best_ni until we determine the best_gw.
-                */
-               if (best_gw)
-                       continue;
-
-               /* if no local_net found continue */
-               if (!local_net)
-                       continue;
+       if (!msg->msg_routing)
+               msg->msg_hdr.src_nid = cpu_to_le64(msg->msg_txni->ni_nid);
 
+       if (routing) {
+               msg->msg_target_is_router = 1;
+               msg->msg_target.pid = LNET_PID_LUSTRE;
                /*
-                * Iterate through the NIs in this local Net and select
-                * the NI to send from. The selection is determined by
-                * these 3 criterion in the following priority:
-                *      1. NUMA
-                *      2. NI available credits
-                *      3. Round Robin
+                * since we're routing we want to ensure that the
+                * msg_hdr.dest_nid is set to the final destination. When
+                * the router receives this message it knows how to route
+                * it.
+                *
+                * final_dst_lpni is set at the beginning of the
+                * lnet_select_pathway() function and is never changed.
+                * It's safe to use it here.
                 */
-               best_ni = lnet_get_best_ni(local_net, best_ni, md_cpt);
-       }
-
-       if (!best_ni && !best_gw) {
-               lnet_net_unlock(cpt);
-               LCONSOLE_WARN("No local ni found to send from to %s\n",
-                       libcfs_nid2str(dst_nid));
-               return -EINVAL;
-       }
-
-       if (!best_ni) {
-               best_ni = lnet_get_best_ni(best_gw->lpni_net, best_ni, md_cpt);
-               LASSERT(best_gw && best_ni);
-
+               msg->msg_hdr.dest_nid = cpu_to_le64(final_dst_lpni->lpni_nid);
+       } else {
                /*
-                * We're going to route the message, so change the peer to
-                * the router.
+                * if we're not routing set the dest_nid to the best peer
+                * ni NID that we picked earlier in the algorithm.
                 */
-               LASSERT(best_gw->lpni_peer_net);
-               LASSERT(best_gw->lpni_peer_net->lpn_peer);
-               best_gw->lpni_gw_seq++;
-               peer = best_gw->lpni_peer_net->lpn_peer;
+               msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid);
        }
 
-       /*
-        * Now that we selected the NI to use increment its sequence
-        * number so the Round Robin algorithm will detect that it has
-        * been used and pick the next NI.
-        */
-       best_ni->ni_seq++;
-
-pick_peer:
-       /*
-        * At this point the best_ni is on a local network on which
-        * the peer has a peer_ni as well
-        */
-       peer_net = lnet_peer_get_net_locked(peer,
-                                           best_ni->ni_net->net_id);
-       /*
-        * peer_net is not available or the src_nid is explicitly defined
-        * and the peer_net for that src_nid is unhealthy. find a route to
-        * the destination nid.
-        */
-       if (!peer_net ||
-           (src_nid != LNET_NID_ANY &&
-            !lnet_is_peer_net_healthy_locked(peer_net))) {
-               best_gw = lnet_find_route_locked(best_ni->ni_net,
-                                                dst_nid,
-                                                rtr_nid);
-               /*
-                * if no route is found for that network then
-                * move onto the next peer_ni in the peer
-                */
-               if (!best_gw) {
-                       LCONSOLE_WARN("No route to peer from %s\n",
-                               libcfs_nid2str(best_ni->ni_nid));
-                       lnet_net_unlock(cpt);
-                       return -EHOSTUNREACH;
-               }
+       rc = lnet_post_send_locked(msg, 0);
 
-               CDEBUG(D_NET, "Best route to %s via %s for %s %d\n",
-                       libcfs_nid2str(dst_nid),
-                       libcfs_nid2str(best_gw->lpni_nid),
-                       lnet_msgtyp2str(msg->msg_type), msg->msg_len);
+       if (!rc)
+               CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) : %s\n",
+                      libcfs_nid2str(msg->msg_hdr.src_nid),
+                      libcfs_nid2str(msg->msg_txni->ni_nid),
+                      libcfs_nid2str(sd->sd_src_nid),
+                      libcfs_nid2str(msg->msg_hdr.dest_nid),
+                      libcfs_nid2str(sd->sd_dst_nid),
+                      libcfs_nid2str(msg->msg_txpeer->lpni_nid),
+                      lnet_msgtyp2str(msg->msg_type));
 
-               routing2 = true;
-               /*
-                * RULE: Each node considers only the next-hop
-                *
-                * We're going to route the message, so change the peer to
-                * the router.
-                */
-               LASSERT(best_gw->lpni_peer_net);
-               LASSERT(best_gw->lpni_peer_net->lpn_peer);
-               peer = best_gw->lpni_peer_net->lpn_peer;
-       } else if (!lnet_is_peer_net_healthy_locked(peer_net)) {
-               /*
-                * this peer_net is unhealthy but we still have an opportunity
-                * to find another peer_net that we can use
-                */
-               __u32 net_id = peer_net->lpn_net_id;
-               LCONSOLE_WARN("peer net %s unhealthy\n",
-                             libcfs_net2str(net_id));
-               goto again;
-       }
+       return rc;
+}
 
+static struct lnet_peer_ni *
+lnet_select_peer_ni(struct lnet_send_data *sd, struct lnet_peer *peer,
+                   struct lnet_peer_net *peer_net)
+{
        /*
         * Look at the peer NIs for the destination peer that connect
         * to the chosen net. If a peer_ni is preferred when using the
@@ -1935,20 +1697,31 @@ pick_peer:
         * the available transmit credits are used. If the transmit
         * credits are equal, we round-robin over the peer_ni.
         */
-       lpni = NULL;
-       best_lpni_credits = INT_MIN;
-       preferred = false;
-       best_lpni = NULL;
+       struct lnet_peer_ni *lpni = NULL;
+       struct lnet_peer_ni *best_lpni = NULL;
+       struct lnet_ni *best_ni = sd->sd_best_ni;
+       lnet_nid_t dst_nid = sd->sd_dst_nid;
+       int best_lpni_credits = INT_MIN;
+       bool preferred = false;
+       bool ni_is_pref;
+
        while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) {
                /*
-                * if this peer ni is not healthy just skip it, no point in
-                * examining it further
+                * if the best_ni we've chosen already has this lpni
+                * preferred, then let's use it
                 */
-               if (!lnet_is_peer_ni_healthy_locked(lpni))
-                       continue;
                ni_is_pref = lnet_peer_is_pref_nid_locked(lpni,
                                                          best_ni->ni_nid);
 
+               CDEBUG(D_NET, "%s ni_is_pref = %d\n",
+                      libcfs_nid2str(best_ni->ni_nid), ni_is_pref);
+
+               if (best_lpni)
+                       CDEBUG(D_NET, "%s c:[%d, %d], s:[%d, %d]\n",
+                               libcfs_nid2str(lpni->lpni_nid),
+                               lpni->lpni_txcredits, best_lpni_credits,
+                               lpni->lpni_seq, best_lpni->lpni_seq);
+
                /* if this is a preferred peer use it */
                if (!preferred && ni_is_pref) {
                        preferred = true;
@@ -1986,133 +1759,812 @@ pick_peer:
        if (!best_lpni) {
                __u32 net_id = (peer_net) ? peer_net->lpn_net_id :
                        LNET_NIDNET(dst_nid);
-               lnet_net_unlock(cpt);
-               LCONSOLE_WARN("no peer_ni found on peer net %s\n",
+               CDEBUG(D_NET, "no peer_ni found on peer net %s\n",
                                libcfs_net2str(net_id));
-               return -EHOSTUNREACH;
+               return NULL;
        }
 
+       CDEBUG(D_NET, "sd_best_lpni = %s\n",
+              libcfs_nid2str(best_lpni->lpni_nid));
 
-send:
-       /* Shortcut for loopback. */
-       if (best_ni == the_lnet.ln_loni) {
-               /* No send credit hassles with LOLND */
-               lnet_ni_addref_locked(best_ni, cpt);
-               msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid);
-               if (!msg->msg_routing)
-                       msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid);
-               msg->msg_target.nid = best_ni->ni_nid;
-               lnet_msg_commit(msg, cpt);
-               msg->msg_txni = best_ni;
-               lnet_net_unlock(cpt);
-
-               return LNET_CREDIT_OK;
-       }
+       return best_lpni;
+}
 
-       routing = routing || routing2;
+/*
+ * Pick the best peer NI of @peer on network @net_id.
+ *
+ * Looks up the peer_net of @peer for @net_id and passes it to
+ * lnet_select_peer_ni(). Returns NULL (after logging an error) when the
+ * peer has no peer_net on @net_id, otherwise returns whatever
+ * lnet_select_peer_ni() returns.
+ *
+ * Prerequisite: the best_ni should already be set in the sd
+ */
+static inline struct lnet_peer_ni *
+lnet_find_best_lpni_on_net(struct lnet_send_data *sd, struct lnet_peer *peer,
+                          __u32 net_id)
+{
+       struct lnet_peer_net *peer_net;
 
        /*
-        * Increment sequence number of the peer selected so that we
-        * pick the next one in Round Robin.
+        * The gateway is Multi-Rail capable so now we must select the
+        * proper peer_ni
+        * (the callers in this file also pass non-gateway MR peers, so
+        * "gateway" here and in the error message is not always literal)
         */
-       best_lpni->lpni_seq++;
+       peer_net = lnet_peer_get_net_locked(peer, net_id);
+
+       if (!peer_net) {
+               CERROR("gateway peer %s has no NI on net %s\n",
+                      libcfs_nid2str(peer->lp_primary_nid),
+                      libcfs_net2str(net_id));
+               return NULL;
+       }
+
+       return lnet_select_peer_ni(sd, peer, peer_net);
+}
+
+/*
+ * Record sd->sd_best_ni as the preferred local NID of sd->sd_best_lpni.
+ *
+ * Nothing is done unless the send case is flagged NMR_DST, the message
+ * is neither a REPLY nor an ACK, and the peer NI has no preferred NID
+ * recorded yet.
+ */
+static inline void
+lnet_set_non_mr_pref_nid(struct lnet_send_data *sd)
+{
+       /* preferred NIDs are only maintained for non-MR destinations */
+       if (!(sd->sd_send_case & NMR_DST))
+               return;
+
+       /* responses never establish the preference */
+       if (sd->sd_msg->msg_type == LNET_MSG_REPLY ||
+           sd->sd_msg->msg_type == LNET_MSG_ACK)
+               return;
+
+       /* a preference is already in place, keep it */
+       if (sd->sd_best_lpni->lpni_pref_nnids != 0)
+               return;
+
+       CDEBUG(D_NET, "Setting preferred local NID %s on NMR peer %s\n",
+              libcfs_nid2str(sd->sd_best_ni->ni_nid),
+              libcfs_nid2str(sd->sd_best_lpni->lpni_nid));
+       lnet_peer_ni_set_non_mr_pref_nid(sd->sd_best_lpni,
+                                        sd->sd_best_ni->ni_nid);
+}
+
+/*
+ * Source Specified
+ * Local Destination
+ * non-mr peer
+ *
+ * use the source and destination NIDs as the pathway
+ *
+ * Returns -EINVAL when sd->sd_src_nid does not resolve to a local NI,
+ * otherwise the result of lnet_handle_send().
+ */
+static int
+lnet_handle_spec_local_nmr_dst(struct lnet_send_data *sd)
+{
+       /* the destination lpni is set before we get here. */
+
+       /* find local NI */
+       sd->sd_best_ni = lnet_nid2ni_locked(sd->sd_src_nid, sd->sd_cpt);
+       if (!sd->sd_best_ni) {
+               CERROR("Can't send to %s: src %s is not a "
+                      "local nid\n", libcfs_nid2str(sd->sd_dst_nid),
+                               libcfs_nid2str(sd->sd_src_nid));
+               return -EINVAL;
+       }
 
        /*
-        * grab a reference on the peer_ni so it sticks around even if
-        * we need to drop and relock the lnet_net_lock below.
+        * the preferred NID will only be set for NMR peers
+        * (the helper itself checks the send case and message type)
         */
-       lnet_peer_ni_addref_locked(best_lpni);
+       lnet_set_non_mr_pref_nid(sd);
+
+       return lnet_handle_send(sd);
+}
+
+/*
+ * Source Specified
+ * Local Destination
+ * MR Peer
+ *
+ * Run the selection algorithm on the peer NIs unless we're sending
+ * a response, in this case just send to the destination
+ *
+ * Returns -EINVAL when sd->sd_src_nid is not a local NI, -EHOSTUNREACH
+ * when no peer NI ends up selected, otherwise the result of
+ * lnet_handle_send().
+ */
+static int
+lnet_handle_spec_local_mr_dst(struct lnet_send_data *sd)
+{
+       sd->sd_best_ni = lnet_nid2ni_locked(sd->sd_src_nid, sd->sd_cpt);
+       if (!sd->sd_best_ni) {
+               CERROR("Can't send to %s: src %s is not a "
+                      "local nid\n", libcfs_nid2str(sd->sd_dst_nid),
+                               libcfs_nid2str(sd->sd_src_nid));
+               return -EINVAL;
+       }
 
        /*
-        * Use lnet_cpt_of_nid() to determine the CPT used to commit the
-        * message. This ensures that we get a CPT that is correct for
-        * the NI when the NI has been restricted to a subset of all CPTs.
-        * If the selected CPT differs from the one currently locked, we
-        * must unlock and relock the lnet_net_lock(), and then check whether
-        * the configuration has changed. We don't have a hold on the best_ni
-        * yet, and it may have vanished.
+        * only run the selection algorithm to pick the peer_ni if we're
+        * sending a GET or a PUT. Responses are sent to the same
+        * destination NID provided.
+        * The selection is restricted to the network of the chosen
+        * local NI and overwrites sd->sd_best_lpni, possibly with NULL.
         */
-       cpt2 = lnet_cpt_of_nid_locked(best_lpni->lpni_nid, best_ni);
-       if (cpt != cpt2) {
-               __u32 seq = lnet_get_dlc_seq_locked();
-               lnet_net_unlock(cpt);
-               cpt = cpt2;
-               lnet_net_lock(cpt);
-               if (seq != lnet_get_dlc_seq_locked()) {
-                       lnet_peer_ni_decref_locked(best_lpni);
-                       goto again;
+       if (!(sd->sd_send_case & SND_RESP)) {
+               sd->sd_best_lpni =
+                 lnet_find_best_lpni_on_net(sd, sd->sd_peer,
+                                            sd->sd_best_ni->ni_net->net_id);
+       }
+
+       if (sd->sd_best_lpni)
+               return lnet_handle_send(sd);
+
+       CERROR("can't send to %s. no NI on %s\n",
+              libcfs_nid2str(sd->sd_dst_nid),
+              libcfs_net2str(sd->sd_best_ni->ni_net->net_id));
+
+       return -EHOSTUNREACH;
+}
+
+/*
+ * Select the local NI to send from on the local network whose id
+ * matches @peer_net.
+ *
+ * @cur_best_ni is forwarded to lnet_get_best_ni() as the current
+ * candidate. When @incr_seq is true the sequence number of the NI that
+ * is returned gets incremented. Returns NULL when there is no local
+ * network with that id or when lnet_get_best_ni() returns NULL.
+ */
+struct lnet_ni *
+lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni,
+                             struct lnet_peer *peer,
+                             struct lnet_peer_net *peer_net,
+                             int cpt,
+                             bool incr_seq)
+{
+       struct lnet_net *net = lnet_get_net_locked(peer_net->lpn_net_id);
+       struct lnet_ni *ni;
+
+       if (net == NULL)
+               return NULL;
+
+       /*
+        * Walk the NIs of this local net and pick the one to send
+        * from. The choice follows these three criteria, in order of
+        * priority:
+        *      1. NUMA
+        *      2. NI available credits
+        *      3. Round Robin
+        */
+       ni = lnet_get_best_ni(net, cur_best_ni, peer, peer_net, cpt);
+       if (ni == NULL)
+               return NULL;
+
+       if (incr_seq)
+               ni->ni_seq++;
+
+       return ni;
+}
+
+/*
+ * Find the gateway to use to reach @dst_nid.
+ *
+ * The route lookup is done for the network of @dst_nid, honouring
+ * sd->sd_rtr_nid. If sd->sd_best_ni is not set yet, a local NI on the
+ * gateway's network is selected and stored there. When the gateway peer
+ * is Multi-Rail, the peer NI to use is re-selected on the network of
+ * sd->sd_best_ni.
+ *
+ * On success returns 0 with *gw_lpni and *gw_peer filled in. Returns
+ * -EHOSTUNREACH when no route exists and -EFAULT when no local NI or
+ * gateway peer NI could be selected. Note that *gw_peer is written as
+ * soon as a route is found, so it may be set even when -EFAULT is
+ * returned; *gw_lpni is only written on success.
+ */
+static int
+lnet_handle_find_routed_path(struct lnet_send_data *sd,
+                            lnet_nid_t dst_nid,
+                            struct lnet_peer_ni **gw_lpni,
+                            struct lnet_peer **gw_peer)
+{
+       struct lnet_peer_ni *gw;
+       lnet_nid_t src_nid = sd->sd_src_nid;
+
+       gw = lnet_find_route_locked(NULL, LNET_NIDNET(dst_nid),
+                                   sd->sd_rtr_nid);
+       if (!gw) {
+               CERROR("no route to %s from %s\n",
+                      libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid));
+               return -EHOSTUNREACH;
+       }
+
+       /* get the peer of the gw_ni */
+       LASSERT(gw->lpni_peer_net);
+       LASSERT(gw->lpni_peer_net->lpn_peer);
+
+       *gw_peer = gw->lpni_peer_net->lpn_peer;
+
+       /* keep a local NI the caller already chose, otherwise pick one */
+       if (!sd->sd_best_ni)
+               sd->sd_best_ni = lnet_find_best_ni_on_spec_net(NULL, *gw_peer,
+                                       gw->lpni_peer_net,
+                                       sd->sd_md_cpt,
+                                       true);
+
+       if (!sd->sd_best_ni) {
+               CERROR("Internal Error. Expected local ni on %s "
+                      "but non found :%s\n",
+                      libcfs_net2str(gw->lpni_peer_net->lpn_net_id),
+                      libcfs_nid2str(sd->sd_src_nid));
+               return -EFAULT;
+       }
+
+       /*
+        * if gw is MR let's find its best peer_ni
+        */
+       if (lnet_peer_is_multi_rail(*gw_peer)) {
+               gw = lnet_find_best_lpni_on_net(sd, *gw_peer,
+                                               sd->sd_best_ni->ni_net->net_id);
+               /*
+                * We've already verified that the gw has an NI on that
+                * desired net, but we're not finding it. Something is
+                * wrong.
+                */
+               if (!gw) {
+                       CERROR("Internal Error. Route expected to %s from %s\n",
+                               libcfs_nid2str(dst_nid),
+                               libcfs_nid2str(src_nid));
+                       return -EFAULT;
                }
        }
 
+       *gw_lpni = gw;
+
+       return 0;
+}
+
+/*
+ * Handle two cases:
+ *
+ * Case 1:
+ *  Source specified
+ *  Remote destination
+ *  Non-MR destination
+ *
+ * Case 2:
+ *  Source specified
+ *  Remote destination
+ *  MR destination
+ *
+ * The handling of these two cases is similar. Even though the destination
+ * can be MR or non-MR, we'll deal directly with the router.
+ *
+ * Returns -EINVAL when sd->sd_src_nid is not a local NI, the negative
+ * error of lnet_handle_find_routed_path() when no gateway can be used,
+ * otherwise the result of lnet_handle_send().
+ */
+static int
+lnet_handle_spec_router_dst(struct lnet_send_data *sd)
+{
+       int rc;
+       struct lnet_peer_ni *gw_lpni = NULL;
+       struct lnet_peer *gw_peer = NULL;
+
+       /* find local NI */
+       sd->sd_best_ni = lnet_nid2ni_locked(sd->sd_src_nid, sd->sd_cpt);
+       if (!sd->sd_best_ni) {
+               CERROR("Can't send to %s: src %s is not a "
+                      "local nid\n", libcfs_nid2str(sd->sd_dst_nid),
+                               libcfs_nid2str(sd->sd_src_nid));
+               return -EINVAL;
+       }
+
+       rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw_lpni,
+                                    &gw_peer);
+       if (rc < 0)
+               return rc;
+
+       if (sd->sd_send_case & NMR_DST)
+               /*
+                * since the final destination is non-MR let's set its
+                * preferred NID before we send. This must happen while
+                * sd->sd_best_lpni has not yet been replaced by the
+                * gateway below.
+                */
+               lnet_set_non_mr_pref_nid(sd);
+
        /*
-        * store the best_lpni in the message right away to avoid having
-        * to do the same operation under different conditions
+        * We're going to send to the gw found so let's set its
+        * info
         */
-       msg->msg_txpeer = best_lpni;
-       msg->msg_txni = best_ni;
+       sd->sd_peer = gw_peer;
+       sd->sd_best_lpni = gw_lpni;
+
+       return lnet_handle_send(sd);
+}
+
+/*
+ * Select the best local NI across all local networks @peer is on.
+ *
+ * Every peer_net of @peer whose network id is also a local network is
+ * passed, together with the best NI found so far, to
+ * lnet_find_best_ni_on_spec_net() without incrementing sequence
+ * numbers. The sequence number of the NI finally chosen is incremented
+ * once. Returns NULL when no NI was selected.
+ */
+struct lnet_ni *
+lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt)
+{
+       struct lnet_peer_net *peer_net = NULL;
+       struct lnet_ni *best_ni = NULL;
 
        /*
-        * grab a reference for the best_ni since now it's in use in this
-        * send. the reference will need to be dropped when the message is
-        * finished in lnet_finalize()
+        * The peer can have multiple interfaces, some of them can be on
+        * the local network and others on a routed network. We should
+        * prefer the local network. However if the local network is not
+        * available then we need to try the routed network
+        */
+
+       /* go through all the peer nets and find the best_ni */
+       list_for_each_entry(peer_net, &peer->lp_peer_nets, lpn_peer_nets) {
+               /*
+                * The peer's list of nets can contain non-local nets. We
+                * want to only examine the local ones.
+                */
+               if (!lnet_get_net_locked(peer_net->lpn_net_id))
+                       continue;
+               best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer,
+                                                  peer_net, md_cpt, false);
+       }
+
+       if (best_ni)
+               /* increment sequence number so we can round robin */
+               best_ni->ni_seq++;
+
+       return best_ni;
+}
+
+/*
+ * Return the local NI already recorded as preferred for the peer on the
+ * network of sd->sd_best_lpni, or NULL if there is none.
+ *
+ * The peer NIs of sd->sd_peer on that network are scanned; the first one
+ * with a preferred NID decides the result, which is the lookup of that
+ * NID via lnet_nid2ni_locked() (and may therefore itself be NULL).
+ * Requires sd->sd_peer and sd->sd_best_lpni to be set.
+ */
+static struct lnet_ni *
+lnet_find_existing_preferred_best_ni(struct lnet_send_data *sd)
+{
+       struct lnet_ni *best_ni = NULL;
+       struct lnet_peer_net *peer_net;
+       struct lnet_peer *peer = sd->sd_peer;
+       struct lnet_peer_ni *best_lpni = sd->sd_best_lpni;
+       struct lnet_peer_ni *lpni;
+       int cpt = sd->sd_cpt;
+
+       /*
+        * We must use a consistent source address when sending to a
+        * non-MR peer. However, a non-MR peer can have multiple NIDs
+        * on multiple networks, and we may even need to talk to this
+        * peer on multiple networks -- certain types of
+        * load-balancing configuration do this.
+        *
+        * So we need to pick the NI the peer prefers for this
+        * particular network.
         */
-       lnet_ni_addref_locked(msg->msg_txni, cpt);
+
+       /* Get the peer_net the target peer_ni is on */
+       peer_net = lnet_peer_get_net_locked(peer,
+                       LNET_NIDNET(best_lpni->lpni_nid));
+       LASSERT(peer_net != NULL);
+       list_for_each_entry(lpni, &peer_net->lpn_peer_nis,
+                               lpni_peer_nis) {
+               /* skip peer NIs that have no preferred NID recorded */
+               if (lpni->lpni_pref_nnids == 0)
+                       continue;
+               LASSERT(lpni->lpni_pref_nnids == 1);
+               best_ni = lnet_nid2ni_locked(
+                               lpni->lpni_pref.nid, cpt);
+               break;
+       }
+
+       return best_ni;
+}
+
+/*
+ * Choose the local NI for a non-MR peer and store it in sd->sd_best_ni.
+ *
+ * An already recorded preferred NI is reused; otherwise one is selected
+ * on the network of sd->sd_best_lpni and, if applicable, recorded as
+ * preferred. Returns 0 on success and -EHOSTUNREACH when no local NI
+ * could be selected.
+ *
+ * Prerequisite: sd->sd_peer and sd->sd_best_lpni should be set
+ */
+static int
+lnet_select_preferred_best_ni(struct lnet_send_data *sd)
+{
+       struct lnet_ni *best_ni = NULL;
+       struct lnet_peer_ni *best_lpni = sd->sd_best_lpni;
 
        /*
-        * Always set the target.nid to the best peer picked. Either the
-        * nid will be one of the preconfigured NIDs, or the same NID as
-        * what was originally set in the target or it will be the NID of
-        * a router if this message should be routed
+        * We must use a consistent source address when sending to a
+        * non-MR peer. However, a non-MR peer can have multiple NIDs
+        * on multiple networks, and we may even need to talk to this
+        * peer on multiple networks -- certain types of
+        * load-balancing configuration do this.
+        *
+        * So we need to pick the NI the peer prefers for this
+        * particular network.
         */
-       msg->msg_target.nid = msg->msg_txpeer->lpni_nid;
+
+       best_ni = lnet_find_existing_preferred_best_ni(sd);
+
+       /* if best_ni is still not set just pick one */
+       if (!best_ni) {
+               best_ni =
+                 lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer,
+                                               sd->sd_best_lpni->lpni_peer_net,
+                                               sd->sd_md_cpt, true);
+               /* If there is no best_ni we don't have a route */
+               if (!best_ni) {
+                       CERROR("no path to %s from net %s\n",
+                               libcfs_nid2str(best_lpni->lpni_nid),
+                               libcfs_net2str(best_lpni->lpni_net->net_id));
+                       return -EHOSTUNREACH;
+               }
+       }
+
+       sd->sd_best_ni = best_ni;
+
+       /* Set preferred NI if necessary. */
+       lnet_set_non_mr_pref_nid(sd);
+
+       return 0;
+}
+
+
+/*
+ * Source not specified
+ * Local destination
+ * Non-MR Peer
+ *
+ * always use the same source NID for NMR peers
+ * If we've talked to that peer before then we already have a preferred
+ * source NI associated with it. Otherwise, we select a preferred local NI
+ * and store it in the peer
+ *
+ * Returns -EFAULT when sd->sd_best_lpni is not set, the error of
+ * lnet_select_preferred_best_ni() when no local NI can be chosen,
+ * otherwise the result of lnet_handle_send().
+ */
+static int
+lnet_handle_any_local_nmr_dst(struct lnet_send_data *sd)
+{
+       int rc;
+
+       /* sd->sd_best_lpni is already set to the final destination */
 
        /*
-        * lnet_msg_commit assigns the correct cpt to the message, which
-        * is used to decrement the correct refcount on the ni when it's
-        * time to return the credits
+        * At this point we should've created the peer ni and peer. If we
+        * can't find it, then something went wrong. Instead of assert
+        * output a relevant message and fail the send
         */
-       lnet_msg_commit(msg, cpt);
+       if (!sd->sd_best_lpni) {
+               CERROR("Internal fault. Unable to send msg %s to %s. "
+                      "NID not known\n",
+                      lnet_msgtyp2str(sd->sd_msg->msg_type),
+                      libcfs_nid2str(sd->sd_dst_nid));
+               return -EFAULT;
+       }
+
+       /* only send once a local NI has been chosen successfully */
+       rc = lnet_select_preferred_best_ni(sd);
+       if (!rc)
+               rc = lnet_handle_send(sd);
 
+       return rc;
+}
+
+/*
+ * Try to send to a Multi-Rail peer over a local network, with the
+ * source NI left for us to choose.
+ *
+ * Returns PASS_THROUGH when no local NI on any of the peer's networks
+ * was selected for a fresh request, which tells the caller to try a
+ * routed path. Otherwise returns the result of lnet_handle_send(),
+ * -EHOSTUNREACH when a response cannot be sent from any local NI, or
+ * -EFAULT when a local NI was found but no peer NI could be selected
+ * on its network.
+ */
+static int
+lnet_handle_any_mr_dsta(struct lnet_send_data *sd)
+{
        /*
-        * If we are routing the message then we don't need to overwrite
-        * the src_nid since it would've been set at the origin. Otherwise
-        * we are the originator so we need to set it.
+        * NOTE we've already handled the remote peer case. So we only
+        * need to worry about the local case here.
+        *
+        * if we're sending a response, ACK or reply, we need to send it
+        * to the destination NID given to us. At this point we already
+        * have the peer_ni we're suppose to send to, so just find the
+        * best_ni on the peer net and use that. Since we're sending to an
+        * MR peer then we can just run the selection algorithm on our
+        * local NIs and pick the best one.
         */
-       if (!msg->msg_routing)
-               msg->msg_hdr.src_nid = cpu_to_le64(msg->msg_txni->ni_nid);
+       if (sd->sd_send_case & SND_RESP) {
+               sd->sd_best_ni =
+                 lnet_find_best_ni_on_spec_net(NULL, sd->sd_peer,
+                                               sd->sd_best_lpni->lpni_peer_net,
+                                               sd->sd_md_cpt, true);
+
+               if (!sd->sd_best_ni) {
+                       /*
+                        * We're not going to deal with not able to send
+                        * a response to the provided final destination
+                        */
+                       CERROR("Can't send response to %s. "
+                              "No local NI available\n",
+                               libcfs_nid2str(sd->sd_dst_nid));
+                       return -EHOSTUNREACH;
+               }
+
+               return lnet_handle_send(sd);
+       }
+
+       /*
+        * If we get here that means we're sending a fresh request, PUT or
+        * GET, so we need to run our standard selection algorithm.
+        * First find the best local interface that's on any of the peer's
+        * networks.
+        */
+       sd->sd_best_ni = lnet_find_best_ni_on_local_net(sd->sd_peer,
+                                                       sd->sd_md_cpt);
+       if (sd->sd_best_ni) {
+               sd->sd_best_lpni =
+                 lnet_find_best_lpni_on_net(sd, sd->sd_peer,
+                                            sd->sd_best_ni->ni_net->net_id);
 
-       if (routing) {
-               msg->msg_target_is_router = 1;
-               msg->msg_target.pid = LNET_PID_LUSTRE;
-               /*
-                * since we're routing we want to ensure that the
-                * msg_hdr.dest_nid is set to the final destination. When
-                * the router receives this message it knows how to route
-                * it.
-                */
-               msg->msg_hdr.dest_nid =
-                       cpu_to_le64(final_dst ? final_dst->lpni_nid : dst_nid);
-       } else {
                /*
-                * if we're not routing set the dest_nid to the best peer
-                * ni that we picked earlier in the algorithm.
+                * if we're successful in selecting a peer_ni on the local
+                * network, then send to it. Otherwise fail the send; only
+                * the case where no local NI was found at all falls
+                * through to trying another routed
+                * network
                 */
-               msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid);
+               if (sd->sd_best_lpni) {
+                       /*
+                        * in case we initially started with a routed
+                        * destination, let's reset to local
+                        */
+                       sd->sd_send_case &= ~REMOTE_DST;
+                       sd->sd_send_case |= LOCAL_DST;
+                       return lnet_handle_send(sd);
+               }
+
+               CERROR("Internal Error. Expected to have a best_lpni: "
+                      "%s -> %s\n",
+                      libcfs_nid2str(sd->sd_src_nid),
+                      libcfs_nid2str(sd->sd_dst_nid));
+
+               return -EFAULT;
        }
 
-       rc = lnet_post_send_locked(msg, 0);
+       /*
+        * Peer doesn't have a local network. Let's see if there is
+        * a remote network we can reach it on.
+        */
+       return PASS_THROUGH;
+}
 
-       if (!rc)
-               CDEBUG(D_NET, "TRACE: %s(%s:%s) -> %s(%s:%s) : %s\n",
-                      libcfs_nid2str(msg->msg_hdr.src_nid),
-                      libcfs_nid2str(msg->msg_txni->ni_nid),
-                      libcfs_nid2str(src_nid),
-                      libcfs_nid2str(msg->msg_hdr.dest_nid),
-                      libcfs_nid2str(dst_nid),
-                      libcfs_nid2str(msg->msg_txpeer->lpni_nid),
-                      lnet_msgtyp2str(msg->msg_type));
+/*
+ * Case 1:
+ *     Source NID not specified
+ *     Local destination
+ *     MR peer
+ *
+ * Case 2:
+ *     Source NID not specified
+ *     Remote destination
+ *     MR peer
+ *
+ * In both of these cases if we're sending a response, ACK or REPLY, then
+ * we need to send to the destination NID provided.
+ *
+ * In the remote case let's deal with MR routers.
+ *
+ */
 
-       lnet_net_unlock(cpt);
+/*
+ * Handle a send with no source specified to a Multi-Rail peer, whether
+ * its NID is on a local or on a remote network.
+ *
+ * A response to a remote destination is sent straight to a gateway.
+ * Everything else first tries the local networks the peer is on and
+ * only falls back to a routed path when that attempt returns
+ * PASS_THROUGH.
+ *
+ * Returns the result of lnet_handle_send() on the chosen path,
+ * -EHOSTUNREACH when a response has no route, or the negative error
+ * reported by the local or routed path selection.
+ */
+static int
+lnet_handle_any_mr_dst(struct lnet_send_data *sd)
+{
+       int rc = 0;
+       struct lnet_peer *gw_peer = NULL;
+       struct lnet_peer_ni *gw_lpni = NULL;
+
+       /*
+        * handle sending a response to a remote peer here so we don't
+        * have to worry about it if we hit lnet_handle_any_mr_dsta()
+        */
+       if (sd->sd_send_case & REMOTE_DST &&
+           sd->sd_send_case & SND_RESP) {
+               /*
+                * use the function-scope gateway variables; declaring
+                * another gw_peer here would shadow the outer one
+                */
+               rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid,
+                                                 &gw_lpni, &gw_peer);
+               if (rc < 0) {
+                       CERROR("Can't send response to %s. "
+                              "No route available\n",
+                               libcfs_nid2str(sd->sd_dst_nid));
+                       return -EHOSTUNREACH;
+               }
+
+               sd->sd_best_lpni = gw_lpni;
+               sd->sd_peer = gw_peer;
+
+               return lnet_handle_send(sd);
+       }
+
+       /*
+        * Even though the NID for the peer might not be on a local network,
+        * since the peer is MR there could be other interfaces on the
+        * local network. In that case we'd still like to prefer the local
+        * network over the routed network. If we're unable to do that
+        * then we select the best router among the different routed networks,
+        * and if the router is MR then we can deal with it as such.
+        */
+       rc = lnet_handle_any_mr_dsta(sd);
+       if (rc != PASS_THROUGH)
+               return rc;
+
+       /*
+        * TODO; One possible enhancement is to run the selection
+        * algorithm on the peer. However for remote peers the credits are
+        * not decremented, so we'll be basically going over the peer NIs
+        * in round robin. An MR router will run the selection algorithm
+        * on the next-hop interfaces.
+        */
+       rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw_lpni,
+                                         &gw_peer);
+       if (rc < 0)
+               return rc;
+
+       /* the send is now known to be routed, whatever it started as */
+       sd->sd_send_case &= ~LOCAL_DST;
+       sd->sd_send_case |= REMOTE_DST;
+
+       sd->sd_peer = gw_peer;
+       sd->sd_best_lpni = gw_lpni;
+
+       return lnet_handle_send(sd);
+}
+
+/*
+ * Source not specified
+ * Remote destination
+ * Non-MR peer
+ *
+ * The message has to go to the given peer NID from the same source NID
+ * that was used for this peer before. On first contact a source NI is
+ * chosen and recorded as the peer's preferred one.
+ *
+ * Returns the negative error of the route lookup, otherwise the result
+ * of lnet_handle_send().
+ */
+static int
+lnet_handle_any_router_nmr_dst(struct lnet_send_data *sd)
+{
+       struct lnet_peer *rtr_peer = NULL;
+       struct lnet_peer_ni *rtr_lpni = NULL;
+       int rc;
+
+       /* reuse the NI already preferred for this NMR peer, if any */
+       sd->sd_best_ni = lnet_find_existing_preferred_best_ni(sd);
+
+       /*
+        * look up the router; this also selects a local NI when none
+        * was found above
+        */
+       rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &rtr_lpni,
+                                         &rtr_peer);
+       if (rc >= 0) {
+               /*
+                * remember the chosen NI as preferred for the final
+                * destination while sd_best_lpni still refers to it
+                */
+               lnet_set_non_mr_pref_nid(sd);
+
+               /* from here on the target of the send is the router */
+               sd->sd_peer = rtr_peer;
+               sd->sd_best_lpni = rtr_lpni;
+
+               rc = lnet_handle_send(sd);
+       }
+
+       return rc;
+}
+
+/*
+ * Dispatch the send described by @sd to the handler for its send case.
+ *
+ * The case is keyed on source (specified/any), destination locality
+ * (local/routed) and peer type (MR/non-MR). Returns whatever the
+ * selected handler returns, or -1 when the combination is not known.
+ */
+static int
+lnet_handle_send_case_locked(struct lnet_send_data *sd)
+{
+       /*
+        * turn off the SND_RESP bit in the local copy only, so that it
+        * does not take part in the switch below. sd->sd_send_case must
+        * keep the bit: it will be checked in the case handling. (A
+        * compound "&=" here would also clear it in sd and make every
+        * SND_RESP test in the handlers false.)
+        */
+       __u32 send_case = sd->sd_send_case & ~SND_RESP;
+
+       CDEBUG(D_NET, "Source %s%s to %s %s %s destination\n",
+               (send_case & SRC_SPEC) ? "Specified: " : "ANY",
+               (send_case & SRC_SPEC) ? libcfs_nid2str(sd->sd_src_nid) : "",
+               (send_case & MR_DST) ? "MR: " : "NMR: ",
+               libcfs_nid2str(sd->sd_dst_nid),
+               (send_case & LOCAL_DST) ? "local" : "routed");
+
+       switch (send_case) {
+       /*
+        * For all cases where the source is specified, we should always
+        * use the destination NID, whether it's an MR destination or not,
+        * since we're continuing a series of related messages for the
+        * same RPC
+        */
+       case SRC_SPEC_LOCAL_NMR_DST:
+               return lnet_handle_spec_local_nmr_dst(sd);
+       case SRC_SPEC_LOCAL_MR_DST:
+               return lnet_handle_spec_local_mr_dst(sd);
+       case SRC_SPEC_ROUTER_NMR_DST:
+       case SRC_SPEC_ROUTER_MR_DST:
+               return lnet_handle_spec_router_dst(sd);
+       case SRC_ANY_LOCAL_NMR_DST:
+               return lnet_handle_any_local_nmr_dst(sd);
+       case SRC_ANY_LOCAL_MR_DST:
+       case SRC_ANY_ROUTER_MR_DST:
+               return lnet_handle_any_mr_dst(sd);
+       case SRC_ANY_ROUTER_NMR_DST:
+               return lnet_handle_any_router_nmr_dst(sd);
+       default:
+               CERROR("Unknown send case\n");
+               return -1;
+       }
+}
+
+/*
+ * Select the pathway (local NI and peer NI) for @msg and send it.
+ *
+ * @src_nid  source NID, or LNET_NID_ANY to let the selection pick one
+ * @dst_nid  NID of the final destination
+ * @msg      message to send
+ * @rtr_nid  router NID handed to the route lookup
+ *
+ * Returns LNET_CREDIT_OK for loopback sends, LNET_DC_WAIT when the
+ * message was queued pending discovery of the peer, a negative error
+ * when the peer cannot be found, discovered or is unhealthy, otherwise
+ * the result of the send case handler. The net lock taken here is
+ * released before returning.
+ */
+static int
+lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
+                   struct lnet_msg *msg, lnet_nid_t rtr_nid)
+{
+       struct lnet_peer_ni     *lpni;
+       struct lnet_peer        *peer;
+       struct lnet_send_data   send_data;
+       int                     cpt, rc;
+       int                     md_cpt;
+       __u32                   send_case = 0;
+
+       /*
+        * get an initial CPT to use for locking. The idea here is not to
+        * serialize the calls to select_pathway, so that as many
+        * operations can run concurrently as possible. To do that we use
+        * the CPT where this call is being executed. Later on when we
+        * determine the CPT to use in lnet_message_commit, we switch the
+        * lock and check if there was any configuration change.  If none,
+        * then we proceed, if there is, then we restart the operation.
+        */
+       cpt = lnet_net_lock_current();
+
+       md_cpt = lnet_cpt_of_md(msg->msg_md, msg->msg_offset);
+       if (md_cpt == CFS_CPT_ANY)
+               md_cpt = cpt;
+
+again:
+       /*
+        * Every attempt starts from a clean slate. A restart happens
+        * because the configuration changed, so the send case bits and
+        * any NI/peer NI selected by the previous attempt must not leak
+        * into this one.
+        */
+       memset(&send_data, 0, sizeof(send_data));
+       send_case = 0;
+
+       /*
+        * If we're being asked to send to the loopback interface, there
+        * is no need to go through any selection. We can just shortcut
+        * the entire process and send over lolnd
+        */
+       if (LNET_NETTYP(LNET_NIDNET(dst_nid)) == LOLND) {
+               /* No send credit hassles with LOLND */
+               lnet_ni_addref_locked(the_lnet.ln_loni, cpt);
+               msg->msg_hdr.dest_nid = cpu_to_le64(the_lnet.ln_loni->ni_nid);
+               if (!msg->msg_routing)
+                       msg->msg_hdr.src_nid =
+                               cpu_to_le64(the_lnet.ln_loni->ni_nid);
+               msg->msg_target.nid = the_lnet.ln_loni->ni_nid;
+               lnet_msg_commit(msg, cpt);
+               msg->msg_txni = the_lnet.ln_loni;
+               lnet_net_unlock(cpt);
+
+               return LNET_CREDIT_OK;
+       }
+
+       /*
+        * find an existing peer_ni, or create one and mark it as having been
+        * created due to network traffic. This call will create the
+        * peer->peer_net->peer_ni tree.
+        */
+       lpni = lnet_nid2peerni_locked(dst_nid, LNET_NID_ANY, cpt);
+       if (IS_ERR(lpni)) {
+               lnet_net_unlock(cpt);
+               return PTR_ERR(lpni);
+       }
+
+       /*
+        * Now that we have a peer_ni, check if we want to discover
+        * the peer. Traffic to the LNET_RESERVED_PORTAL should not
+        * trigger discovery.
+        */
+       peer = lpni->lpni_peer_net->lpn_peer;
+       if (lnet_msg_discovery(msg) && !lnet_peer_is_uptodate(peer)) {
+               lnet_nid_t primary_nid;
+               rc = lnet_discover_peer_locked(lpni, cpt, false);
+               if (rc) {
+                       lnet_peer_ni_decref_locked(lpni);
+                       lnet_net_unlock(cpt);
+                       return rc;
+               }
+               /* The peer may have changed. */
+               peer = lpni->lpni_peer_net->lpn_peer;
+               /* queue message and return */
+               msg->msg_src_nid_param = src_nid;
+               msg->msg_rtr_nid_param = rtr_nid;
+               msg->msg_sending = 0;
+               list_add_tail(&msg->msg_list, &peer->lp_dc_pendq);
+               lnet_peer_ni_decref_locked(lpni);
+               primary_nid = peer->lp_primary_nid;
+               lnet_net_unlock(cpt);
+
+               CDEBUG(D_NET, "%s pending discovery\n",
+                      libcfs_nid2str(primary_nid));
+
+               return LNET_DC_WAIT;
+       }
+       lnet_peer_ni_decref_locked(lpni);
+
+       /* If peer is not healthy then can not send anything to it */
+       if (!lnet_is_peer_healthy_locked(peer)) {
+               lnet_net_unlock(cpt);
+               return -EHOSTUNREACH;
+       }
+
+       /*
+        * Identify the different send cases
+        */
+       if (src_nid == LNET_NID_ANY)
+               send_case |= SRC_ANY;
+       else
+               send_case |= SRC_SPEC;
+
+       if (lnet_get_net_locked(LNET_NIDNET(dst_nid)))
+               send_case |= LOCAL_DST;
+       else
+               send_case |= REMOTE_DST;
+
+       if (!lnet_peer_is_multi_rail(peer))
+               send_case |= NMR_DST;
+       else
+               send_case |= MR_DST;
+
+       if (msg->msg_type == LNET_MSG_REPLY ||
+           msg->msg_type == LNET_MSG_ACK)
+               send_case |= SND_RESP;
+
+       /* assign parameters to the send_data */
+       send_data.sd_msg = msg;
+       send_data.sd_rtr_nid = rtr_nid;
+       send_data.sd_src_nid = src_nid;
+       send_data.sd_dst_nid = dst_nid;
+       send_data.sd_best_lpni = lpni;
+       /*
+        * keep a pointer to the final destination in case we're going to
+        * route, so we'll need to access it later
+        */
+       send_data.sd_final_dst_lpni = lpni;
+       send_data.sd_peer = peer;
+       send_data.sd_md_cpt = md_cpt;
+       send_data.sd_cpt = cpt;
+       send_data.sd_send_case = send_case;
+
+       rc = lnet_handle_send_case_locked(&send_data);
+
+       /*
+        * The handler may have switched the lock to another CPT and
+        * recorded it in sd_cpt. Track it locally so that a restart
+        * looks up, and on error unlocks, the CPT that is really held.
+        */
+       cpt = send_data.sd_cpt;
+
+       if (rc == REPEAT_SEND)
+               goto again;
+
+       lnet_net_unlock(send_data.sd_cpt);
 
        return rc;
 }