Whamcloud - gitweb
LU-12889 lnet: Do not assume peers are MR capable 12/36512/8
authorChris Horn <hornc@cray.com>
Fri, 18 Oct 2019 19:16:53 +0000 (14:16 -0500)
committerOleg Drokin <green@whamcloud.com>
Sat, 1 Feb 2020 08:10:23 +0000 (08:10 +0000)
If a peer has discovery disabled then it will not consolidate peer
NI information. This means we need to use a consistent source NI
when sending to it just like we do for non-MR peers.

A comment in lnet_discovery_event_reply() indicates that this was a
known issue, but the situation is not handled properly.

Do not assume peers are multi-rail capable when peer objects are
allocated and initialized.

Do not mark a peer as multi-rail capable unless all of the following
conditions are satisified:
1. The peer has the MR feature flag set
2. The peer has discovery enabled.
3. We have discovery enabled locally

Note: 1, 2, and 3 above are implemented in the code for
lnet_discovery_event_reply(), but code earlier in the function breaks
this behavior. Remove the offending code.

Update sanity-lnet tests 100 and 101 to reflect the fact that peers
added via the traffic path no longer have multi-rail by default.

Cray-bug-id: LUS-7918
Test-Parameters: trivial testlist=sanity-lnet
Signed-off-by: Chris Horn <hornc@cray.com>
Change-Id: Ia02bd446f4b2143fb490f56c1ff6103198316da3
Reviewed-on: https://review.whamcloud.com/36512
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Amir Shehata <ashehata@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Neil Brown <neilb@suse.de>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/lnet/peer.c
lustre/tests/sanity-lnet.sh

index 1a0601f..b373f49 100644 (file)
@@ -1530,11 +1530,7 @@ lnet_peer_ni_traffic_add(lnet_nid_t nid, lnet_nid_t pref)
        struct lnet_peer *lp;
        struct lnet_peer_net *lpn;
        struct lnet_peer_ni *lpni;
        struct lnet_peer *lp;
        struct lnet_peer_net *lpn;
        struct lnet_peer_ni *lpni;
-       /*
-        * Assume peer is Multi-Rail capable and let discovery find out
-        * otherwise.
-        */
-       unsigned flags = LNET_PEER_MULTI_RAIL;
+       unsigned flags = 0;
        int rc = 0;
 
        if (nid == LNET_NID_ANY) {
        int rc = 0;
 
        if (nid == LNET_NID_ANY) {
@@ -2314,20 +2310,6 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev)
 
 
        /*
 
 
        /*
-        * Only enable the multi-rail feature on the peer if both sides of
-        * the connection have discovery on
-        */
-       if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) {
-               CDEBUG(D_NET, "Peer %s has Multi-Rail feature enabled\n",
-                      libcfs_nid2str(lp->lp_primary_nid));
-               lp->lp_state |= LNET_PEER_MULTI_RAIL;
-       } else {
-               CDEBUG(D_NET, "Peer %s has Multi-Rail feature disabled\n",
-                      libcfs_nid2str(lp->lp_primary_nid));
-               lp->lp_state &= ~LNET_PEER_MULTI_RAIL;
-       }
-
-       /*
         * The peer may have discovery disabled at its end. Set
         * NO_DISCOVERY as appropriate.
         */
         * The peer may have discovery disabled at its end. Set
         * NO_DISCOVERY as appropriate.
         */
@@ -2349,22 +2331,24 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev)
         */
        if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) {
                if (lp->lp_state & LNET_PEER_MULTI_RAIL) {
         */
        if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) {
                if (lp->lp_state & LNET_PEER_MULTI_RAIL) {
-                       /* Everything's fine */
+                       CDEBUG(D_NET, "peer %s(%p) is MR\n",
+                              libcfs_nid2str(lp->lp_primary_nid), lp);
                } else if (lp->lp_state & LNET_PEER_CONFIGURED) {
                        CWARN("Reply says %s is Multi-Rail, DLC says not\n",
                              libcfs_nid2str(lp->lp_primary_nid));
                } else if (lp->lp_state & LNET_PEER_CONFIGURED) {
                        CWARN("Reply says %s is Multi-Rail, DLC says not\n",
                              libcfs_nid2str(lp->lp_primary_nid));
+               } else if (lnet_peer_discovery_disabled) {
+                       CDEBUG(D_NET,
+                              "peer %s(%p) not MR: DD disabled locally\n",
+                              libcfs_nid2str(lp->lp_primary_nid), lp);
+               } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) {
+                       CDEBUG(D_NET,
+                              "peer %s(%p) not MR: DD disabled remotely\n",
+                              libcfs_nid2str(lp->lp_primary_nid), lp);
                } else {
                } else {
-                       /*
-                        * if discovery is disabled then we don't want to
-                        * update the state of the peer. All we'll do is
-                        * update the peer_nis which were reported back in
-                        * the initial ping
-                        */
-
-                       if (!lnet_is_discovery_disabled_locked(lp)) {
-                               lp->lp_state |= LNET_PEER_MULTI_RAIL;
-                               lnet_peer_clr_non_mr_pref_nids(lp);
-                       }
+                       CDEBUG(D_NET, "peer %s(%p) is MR capable\n",
+                              libcfs_nid2str(lp->lp_primary_nid), lp);
+                       lp->lp_state |= LNET_PEER_MULTI_RAIL;
+                       lnet_peer_clr_non_mr_pref_nids(lp);
                }
        } else if (lp->lp_state & LNET_PEER_MULTI_RAIL) {
                if (lp->lp_state & LNET_PEER_CONFIGURED) {
                }
        } else if (lp->lp_state & LNET_PEER_MULTI_RAIL) {
                if (lp->lp_state & LNET_PEER_CONFIGURED) {
index a1a3b08..d9b6819 100755 (executable)
@@ -1049,7 +1049,7 @@ route:
       health_sensitivity: 1
 peer:
     - primary nid: 7.7.7.7@tcp
       health_sensitivity: 1
 peer:
     - primary nid: 7.7.7.7@tcp
-      Multi-Rail: True
+      Multi-Rail: False
       peer ni:
         - nid: 7.7.7.7@tcp
 EOF
       peer ni:
         - nid: 7.7.7.7@tcp
 EOF
@@ -1091,15 +1091,15 @@ route:
       health_sensitivity: 1
 peer:
     - primary nid: 8.8.8.9@tcp
       health_sensitivity: 1
 peer:
     - primary nid: 8.8.8.9@tcp
-      Multi-Rail: True
+      Multi-Rail: False
       peer ni:
         - nid: 8.8.8.9@tcp
     - primary nid: 8.8.8.10@tcp
       peer ni:
         - nid: 8.8.8.9@tcp
     - primary nid: 8.8.8.10@tcp
-      Multi-Rail: True
+      Multi-Rail: False
       peer ni:
         - nid: 8.8.8.10@tcp
     - primary nid: 8.8.8.8@tcp
       peer ni:
         - nid: 8.8.8.10@tcp
     - primary nid: 8.8.8.8@tcp
-      Multi-Rail: True
+      Multi-Rail: False
       peer ni:
         - nid: 8.8.8.8@tcp
 EOF
       peer ni:
         - nid: 8.8.8.8@tcp
 EOF