From 71ca66bcd9c3a08edfbb67c9711c17428f599bcf Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Wed, 22 Apr 2020 17:06:23 -0700 Subject: [PATCH] LU-13471 lnet: use the same src nid for discovery When discovering a remote peer (not on the same network), a GET is sent to the peer to retrieve the peer's interfaces. This is followed by a PUSH, if discovery is on, to push the node's interfaces. However, if both node and peer have multiple interfaces, it is likely that the GET and the PUSH will originate on different interfaces. When the peer receives the PUSH, it will not be able to connect the two NIDs and will not be able to consolidate the node's NIDs. This issue is specific to remote peers because at the time the push handler is invoked the remote lpni has not been created yet. lnet_parse() creates the lpni of the gateway. Similar to the strategy already in place of using the same source NID for all the messages of an RPC, discovery should use the same source NID for both the GET and PUSH. This patch stores the source NID that the GET was sent from and uses it for the PUSH. 
Signed-off-by: Amir Shehata Change-Id: I5a13ab7799b2ddc47714202bcbed786b0d3940b7 Reviewed-on: https://review.whamcloud.com/38320 Tested-by: jenkins Reviewed-by: Chris Horn Tested-by: Maloo Reviewed-by: Serguei Smirnov Reviewed-by: Oleg Drokin --- lnet/include/lnet/lib-types.h | 3 +++ lnet/lnet/peer.c | 13 ++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index f0f4181..76ac7e3 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -593,6 +593,9 @@ struct lnet_peer { /* primary NID of the peer */ lnet_nid_t lp_primary_nid; + /* source NID to use during discovery */ + lnet_nid_t lp_disc_src_nid; + /* net to perform discovery on */ __u32 lp_disc_net_id; diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 210a96b..3719a74 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -258,6 +258,7 @@ lnet_peer_alloc(lnet_nid_t nid) init_waitqueue_head(&lp->lp_dc_waitq); spin_lock_init(&lp->lp_lock); lp->lp_primary_nid = nid; + lp->lp_disc_src_nid = LNET_NID_ANY; if (lnet_peers_start_down()) lp->lp_alive = false; else @@ -2291,6 +2292,8 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) spin_lock(&lp->lp_lock); + lp->lp_disc_src_nid = ev->target.nid; + /* * If some kind of error happened the contents of message * cannot be used. Set PING_FAILED to trigger a retry. @@ -3109,10 +3112,18 @@ __must_hold(&lp->lp_lock) goto fail_unlink; } - rc = LNetPut(LNET_NID_ANY, lp->lp_push_mdh, + rc = LNetPut(lp->lp_disc_src_nid, lp->lp_push_mdh, LNET_ACK_REQ, id, LNET_RESERVED_PORTAL, LNET_PROTO_PING_MATCHBITS, 0, 0); + /* + * reset the discovery nid. There is no need to restrict sending + * from that source, if we call lnet_push_update_to_peers(). It'll + * get set to a specific NID, if we initiate discovery from the + * scratch + */ + lp->lp_disc_src_nid = LNET_NID_ANY; + if (rc) goto fail_unlink; -- 1.8.3.1