From 3c580c93b8d3e94fac0ac2cf3cca2ff706c6497a Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Fri, 18 Oct 2019 14:16:53 -0500 Subject: [PATCH] LU-12889 lnet: Do not assume peers are MR capable If a peer has discovery disabled then it will not consolidate peer NI information. This means we need to use a consistent source NI when sending to it just like we do for non-MR peers. A comment in lnet_discovery_event_reply() indicates that this was a known issue, but the situation is not handled properly. Do not assume peers are multi-rail capable when peer objects are allocated and initialized. Do not mark a peer as multi-rail capable unless all of the following conditions are satisified: 1. The peer has the MR feature flag set 2. The peer has discovery enabled. 3. We have discovery enabled locally Note: 1, 2, and 3 above are implemented in the code for lnet_discovery_event_reply(), but code earlier in the function breaks this behavior. Remove the offending code. Update sanity-lnet tests 100 and 101 to reflect the fact that peers added via the traffic path no longer have multi-rail by default. Cray-bug-id: LUS-7918 Test-Parameters: trivial testlist=sanity-lnet Signed-off-by: Chris Horn Change-Id: Ia02bd446f4b2143fb490f56c1ff6103198316da3 Reviewed-on: https://review.whamcloud.com/36512 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Amir Shehata Reviewed-by: Serguei Smirnov Reviewed-by: Neil Brown Reviewed-by: Oleg Drokin --- lnet/lnet/peer.c | 46 +++++++++++++++------------------------------ lustre/tests/sanity-lnet.sh | 8 ++++---- 2 files changed, 19 insertions(+), 35 deletions(-) diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 1a0601f..b373f49 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -1530,11 +1530,7 @@ lnet_peer_ni_traffic_add(lnet_nid_t nid, lnet_nid_t pref) struct lnet_peer *lp; struct lnet_peer_net *lpn; struct lnet_peer_ni *lpni; - /* - * Assume peer is Multi-Rail capable and let discovery find out - * otherwise. - */ - unsigned flags = LNET_PEER_MULTI_RAIL; + unsigned flags = 0; int rc = 0; if (nid == LNET_NID_ANY) { @@ -2314,20 +2310,6 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) /* - * Only enable the multi-rail feature on the peer if both sides of - * the connection have discovery on - */ - if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) { - CDEBUG(D_NET, "Peer %s has Multi-Rail feature enabled\n", - libcfs_nid2str(lp->lp_primary_nid)); - lp->lp_state |= LNET_PEER_MULTI_RAIL; - } else { - CDEBUG(D_NET, "Peer %s has Multi-Rail feature disabled\n", - libcfs_nid2str(lp->lp_primary_nid)); - lp->lp_state &= ~LNET_PEER_MULTI_RAIL; - } - - /* * The peer may have discovery disabled at its end. Set * NO_DISCOVERY as appropriate. */ @@ -2349,22 +2331,24 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) */ if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) { if (lp->lp_state & LNET_PEER_MULTI_RAIL) { - /* Everything's fine */ + CDEBUG(D_NET, "peer %s(%p) is MR\n", + libcfs_nid2str(lp->lp_primary_nid), lp); } else if (lp->lp_state & LNET_PEER_CONFIGURED) { CWARN("Reply says %s is Multi-Rail, DLC says not\n", libcfs_nid2str(lp->lp_primary_nid)); + } else if (lnet_peer_discovery_disabled) { + CDEBUG(D_NET, + "peer %s(%p) not MR: DD disabled locally\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, + "peer %s(%p) not MR: DD disabled remotely\n", + libcfs_nid2str(lp->lp_primary_nid), lp); } else { - /* - * if discovery is disabled then we don't want to - * update the state of the peer. All we'll do is - * update the peer_nis which were reported back in - * the initial ping - */ - - if (!lnet_is_discovery_disabled_locked(lp)) { - lp->lp_state |= LNET_PEER_MULTI_RAIL; - lnet_peer_clr_non_mr_pref_nids(lp); - } + CDEBUG(D_NET, "peer %s(%p) is MR capable\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); } } else if (lp->lp_state & LNET_PEER_MULTI_RAIL) { if (lp->lp_state & LNET_PEER_CONFIGURED) { diff --git a/lustre/tests/sanity-lnet.sh b/lustre/tests/sanity-lnet.sh index a1a3b08..d9b6819 100755 --- a/lustre/tests/sanity-lnet.sh +++ b/lustre/tests/sanity-lnet.sh @@ -1049,7 +1049,7 @@ route: health_sensitivity: 1 peer: - primary nid: 7.7.7.7@tcp - Multi-Rail: True + Multi-Rail: False peer ni: - nid: 7.7.7.7@tcp EOF @@ -1091,15 +1091,15 @@ route: health_sensitivity: 1 peer: - primary nid: 8.8.8.9@tcp - Multi-Rail: True + Multi-Rail: False peer ni: - nid: 8.8.8.9@tcp - primary nid: 8.8.8.10@tcp - Multi-Rail: True + Multi-Rail: False peer ni: - nid: 8.8.8.10@tcp - primary nid: 8.8.8.8@tcp - Multi-Rail: True + Multi-Rail: False peer ni: - nid: 8.8.8.8@tcp EOF -- 1.8.3.1