Whamcloud - gitweb
LU-7734 lnet: Routing fixes part 2
authorAmir Shehata <amir.shehata@intel.com>
Wed, 6 Jul 2016 02:36:08 +0000 (19:36 -0700)
committerAmir Shehata <amir.shehata@intel.com>
Wed, 25 Jan 2017 03:10:16 +0000 (19:10 -0800)
Fix lnet_select_pathway() to handle the routing cases correctly.
The following general cases are handled:
. Non-MR directly connected
. Non-MR not directly connected
. MR Directly connected
. MR Not directly connected
  . No gateway
  . Gateway is non-mr
  . Gateway is mr

Signed-off-by: Amir Shehata <amir.shehata@intel.com>
Change-Id: If2d16b797b94421e78a9f2a254a250a440f8b244
Reviewed-on: http://review.whamcloud.com/21167

lnet/lnet/lib-move.c
lnet/lnet/peer.c

index c98ec94..6a617d5 100644 (file)
@@ -1325,6 +1325,7 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
        __u32                   seq;
        int                     cpt, cpt2, rc;
        bool                    routing;
+       bool                    routing2;
        bool                    ni_is_pref;
        bool                    preferred;
        int                     best_credits;
@@ -1348,6 +1349,7 @@ again:
        best_gw = NULL;
        local_net = NULL;
        routing = false;
+       routing2 = false;
 
        seq = lnet_get_dlc_seq_locked();
 
@@ -1382,7 +1384,7 @@ again:
        }
 
        /*
-        * STEP 1: first jab at determineing best_ni
+        * STEP 1: first jab at determining best_ni
         * if src_nid is explicitly specified, then best_ni is already
         * pre-determiend for us. Otherwise we need to select the best
         * one to use later on
@@ -1396,18 +1398,122 @@ again:
                                      libcfs_nid2str(src_nid));
                        return -EINVAL;
                }
+       }
+
+       if (msg->msg_type == LNET_MSG_REPLY ||
+           msg->msg_type == LNET_MSG_ACK ||
+           !peer->lp_multi_rail) {
+               /*
+                * for replies we want to respond on the same peer_ni we
+                * received the message on if possible. If not, then pick
+                * a peer_ni to send to
+                *
+                * if the peer is non-multi-rail then you want to send to
+                * the dst_nid provided as well.
+                *
+                * It is expected to find the lpni using dst_nid, since we
+                * created it earlier.
+                */
+               best_lpni = lnet_find_peer_ni_locked(dst_nid);
+               if (best_lpni)
+                       lnet_peer_ni_decref_locked(best_lpni);
+
+               if (best_lpni && !lnet_get_net_locked(LNET_NIDNET(dst_nid))) {
+                       /*
+                        * this lpni is not on a local network so we need
+                        * to route this reply.
+                        */
+                       best_gw = lnet_find_route_locked(NULL,
+                                                        best_lpni->lpni_nid,
+                                                        rtr_nid);
+                       if (best_gw) {
+                               /*
+                               * RULE: Each node considers only the next-hop
+                               *
+                               * We're going to route the message, so change the peer to
+                               * the router.
+                               */
+                               LASSERT(best_gw->lpni_peer_net);
+                               LASSERT(best_gw->lpni_peer_net->lpn_peer);
+                               peer = best_gw->lpni_peer_net->lpn_peer;
+
+                               /*
+                               * if the router is not multi-rail then use the best_gw
+                               * found to send the message to
+                               */
+                               if (!peer->lp_multi_rail)
+                                       best_lpni = best_gw;
+                               else
+                                       best_lpni = NULL;
 
-               if (best_ni->ni_net->net_id != LNET_NIDNET(dst_nid)) {
+                               routing = true;
+                       } else {
+                               best_lpni = NULL;
+                       }
+               } else if (!best_lpni) {
                        lnet_net_unlock(cpt);
-                       LCONSOLE_WARN("No route to %s via from %s\n",
-                                     libcfs_nid2str(dst_nid),
-                                     libcfs_nid2str(src_nid));
+                       CERROR("unable to send msg_type %d to "
+                             "originating %s. Destination NID not in DB\n",
+                             msg->msg_type, libcfs_nid2str(dst_nid));
                        return -EINVAL;
                }
-               goto pick_peer;
        }
 
        /*
+        * if the peer is not MR capable, then we should always send to it
+        * using the first NI in the NET we determined.
+        */
+       if (!peer->lp_multi_rail) {
+               if (!best_lpni) {
+                       lnet_net_unlock(cpt);
+                       CERROR("no route to %s\n",
+                              libcfs_nid2str(dst_nid));
+                       return -EHOSTUNREACH;
+               }
+
+               /* best ni could be set because src_nid was provided */
+               if (!best_ni) {
+                       best_ni = lnet_net2ni_locked(best_lpni->lpni_net->net_id, cpt);
+                       if (!best_ni) {
+                               lnet_net_unlock(cpt);
+                               CERROR("no path to %s from net %s\n",
+                               libcfs_nid2str(best_lpni->lpni_nid),
+                               libcfs_net2str(best_lpni->lpni_net->net_id));
+                               return -EHOSTUNREACH;
+                       }
+               }
+       }
+
+       if (best_ni == the_lnet.ln_loni) {
+               /* No send credit hassles with LOLND */
+               lnet_ni_addref_locked(best_ni, cpt);
+               msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid);
+               if (!msg->msg_routing)
+                       msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid);
+               msg->msg_target.nid = best_ni->ni_nid;
+               lnet_msg_commit(msg, cpt);
+               msg->msg_txni = best_ni;
+               lnet_net_unlock(cpt);
+
+               return LNET_CREDIT_OK;
+       }
+
+       /*
+        * if we already found a best_ni because src_nid is specified and
+        * best_lpni because we are replying to a message then just send
+        * the message
+        */
+       if (best_ni && best_lpni)
+               goto send;
+
+       /*
+        * If we already found a best_ni because src_nid is specified then
+        * pick the peer then send the message
+        */
+       if (best_ni)
+               goto pick_peer;
+
+       /*
         * Decide whether we need to route to peer_ni.
         * Get the local net that I need to be on to be able to directly
         * send to that peer.
@@ -1423,7 +1529,7 @@ again:
                        continue;
 
                local_net = lnet_get_net_locked(peer_net->lpn_net_id);
-               if (!local_net) {
+               if (!local_net && !routing) {
                        struct lnet_peer_ni *net_gw;
                        /*
                         * go through each peer_ni on that peer_net and
@@ -1444,14 +1550,11 @@ again:
 
                                if (!best_gw) {
                                        best_gw = net_gw;
-                                       best_lpni = lpni;
                                } else  {
                                        rc = lnet_compare_peers(net_gw,
                                                                best_gw);
-                                       if (rc > 0) {
+                                       if (rc > 0)
                                                best_gw = net_gw;
-                                               best_lpni = lpni;
-                                       }
                                }
                        }
 
@@ -1460,9 +1563,9 @@ again:
 
                        local_net = lnet_get_net_locked
                                        (LNET_NIDNET(best_gw->lpni_nid));
-                       routing = true;
+                       routing2 = true;
                } else {
-                       routing = false;
+                       routing2 = false;
                        best_gw = NULL;
                }
 
@@ -1523,12 +1626,17 @@ again:
                }
        }
 
-       /*
-        * if the peer is not MR capable, then we should always send to it
-        * using the first NI in the NET we determined.
-        */
-       if (!peer->lp_multi_rail && local_net != NULL)
-               best_ni = lnet_net2ni_locked(local_net->net_id, cpt);
+       if (routing2) {
+               /*
+                * RULE: Each node considers only the next-hop
+                *
+                * We're going to route the message, so change the peer to
+                * the router.
+                */
+               LASSERT(best_gw->lpni_peer_net);
+               LASSERT(best_gw->lpni_peer_net->lpn_peer);
+               peer = best_gw->lpni_peer_net->lpn_peer;
+       }
 
        if (!best_ni) {
                lnet_net_unlock(cpt);
@@ -1544,42 +1652,11 @@ again:
         */
        best_ni->ni_seq++;
 
-       if (routing)
-               goto send;
-
 pick_peer:
-       if (best_ni == the_lnet.ln_loni) {
-               /* No send credit hassles with LOLND */
-               lnet_ni_addref_locked(best_ni, cpt);
-               msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid);
-               if (!msg->msg_routing)
-                       msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid);
-               msg->msg_target.nid = best_ni->ni_nid;
-               lnet_msg_commit(msg, cpt);
-               msg->msg_txni = best_ni;
-               lnet_net_unlock(cpt);
-
-               return LNET_CREDIT_OK;
-       }
-
-       if (msg->msg_type == LNET_MSG_REPLY ||
-           msg->msg_type == LNET_MSG_ACK) {
-               /*
-                * for replies we want to respond on the same peer_ni we
-                * received the message on if possible. If not, then pick
-                * a peer_ni to send to
-                */
-               best_lpni = lnet_find_peer_ni_locked(dst_nid);
-               if (best_lpni) {
-                       lnet_peer_ni_decref_locked(best_lpni);
-                       goto send;
-               } else {
-                       CDEBUG(D_NET, "unable to send msg_type %d to "
-                             "originating %s\n", msg->msg_type,
-                             libcfs_nid2str(dst_nid));
-               }
-       }
-
+       /*
+        * At this point the best_ni is on a local network on which
+        * the peer has a peer_ni as well
+        */
        peer_net = lnet_peer_get_net_locked(peer,
                                            best_ni->ni_net->net_id);
        /*
@@ -1609,13 +1686,16 @@ pick_peer:
                        libcfs_nid2str(best_gw->lpni_nid),
                        lnet_msgtyp2str(msg->msg_type), msg->msg_len);
 
-               best_lpni = lnet_find_peer_ni_locked(dst_nid);
-               LASSERT(best_lpni != NULL);
-               lnet_peer_ni_decref_locked(best_lpni);
-
-               routing = true;
-
-               goto send;
+               routing2 = true;
+               /*
+                * RULE: Each node considers only the next-hop
+                *
+                * We're going to route the message, so change the peer to
+                * the router.
+                */
+               LASSERT(best_gw->lpni_peer_net);
+               LASSERT(best_gw->lpni_peer_net->lpn_peer);
+               peer = best_gw->lpni_peer_net->lpn_peer;
        } else if (!lnet_is_peer_net_healthy_locked(peer_net)) {
                /*
                 * this peer_net is unhealthy but we still have an opportunity
@@ -1638,6 +1718,7 @@ pick_peer:
        lpni = NULL;
        best_lpni_credits = INT_MIN;
        preferred = false;
+       best_lpni = NULL;
        while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) {
                /*
                 * if this peer ni is not healthy just skip it, no point in
@@ -1690,6 +1771,8 @@ pick_peer:
        }
 
 send:
+       routing = routing || routing2;
+
        /*
         * Increment sequence number of the peer selected so that we
         * pick the next one in Round Robin.
@@ -1697,13 +1780,6 @@ send:
        best_lpni->lpni_seq++;
 
        /*
-        * When routing the best gateway found acts as the best peer
-        * NI to send to.
-        */
-       if (routing)
-               best_lpni = best_gw;
-
-       /*
         * grab a reference on the peer_ni so it sticks around even if
         * we need to drop and relock the lnet_net_lock below.
         */
index aa85c5e..a1f6990 100644 (file)
@@ -258,11 +258,18 @@ lnet_try_destroy_peer_hierarchy_locked(struct lnet_peer_ni *lpni)
 }
 
 /* called with lnet_net_lock LNET_LOCK_EX held */
-static void
+static int
 lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni)
 {
        struct lnet_peer_table *ptable = NULL;
 
+       /* don't remove a peer_ni if it's also a gateway */
+       if (lpni->lpni_rtr_refcount > 0) {
+               CERROR("Peer NI %s is a gateway. Can not delete it\n",
+                      libcfs_nid2str(lpni->lpni_nid));
+               return -EBUSY;
+       }
+
        lnet_peer_remove_from_remote_list(lpni);
 
        /* remove peer ni from the hash list. */
@@ -293,6 +300,8 @@ lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni)
 
        /* decrement reference on peer */
        lnet_peer_ni_decref_locked(lpni);
+
+       return 0;
 }
 
 void lnet_peer_uninit()
@@ -311,17 +320,22 @@ void lnet_peer_uninit()
        lnet_net_unlock(LNET_LOCK_EX);
 }
 
-static void
+static int
 lnet_peer_del_locked(struct lnet_peer *peer)
 {
        struct lnet_peer_ni *lpni = NULL, *lpni2;
+       int rc = 0, rc2 = 0;
 
        lpni = lnet_get_next_peer_ni_locked(peer, NULL, lpni);
        while (lpni != NULL) {
                lpni2 = lnet_get_next_peer_ni_locked(peer, NULL, lpni);
-               lnet_peer_ni_del_locked(lpni);
+               rc = lnet_peer_ni_del_locked(lpni);
+               if (rc != 0)
+                       rc2 = rc;
                lpni = lpni2;
        }
+
+       return rc2;
 }
 
 static void
@@ -893,6 +907,7 @@ lnet_del_peer_ni_from_peer(lnet_nid_t key_nid, lnet_nid_t nid)
        lnet_nid_t local_nid;
        struct lnet_peer *peer;
        struct lnet_peer_ni *lpni;
+       int rc;
 
        if (key_nid == LNET_NID_ANY)
                return -EINVAL;
@@ -913,17 +928,17 @@ lnet_del_peer_ni_from_peer(lnet_nid_t key_nid, lnet_nid_t nid)
                 * entire peer
                 */
                lnet_net_lock(LNET_LOCK_EX);
-               lnet_peer_del_locked(peer);
+               rc = lnet_peer_del_locked(peer);
                lnet_net_unlock(LNET_LOCK_EX);
 
-               return 0;
+               return rc;
        }
 
        lnet_net_lock(LNET_LOCK_EX);
-       lnet_peer_ni_del_locked(lpni);
+       rc = lnet_peer_ni_del_locked(lpni);
        lnet_net_unlock(LNET_LOCK_EX);
 
-       return 0;
+       return rc;
 }
 
 void