From 584d9e46053234d02a3290822317552785e44e76 Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Thu, 6 Aug 2020 16:24:57 -0500 Subject: [PATCH] LU-13883 lnet: Lookup lpni after discovery The lpni for a nid can change as part of the discovery process (see lnet_peer_add_nid()). As such, callers of lnet_discover_peer_locked() need to lookup the lpni again after discovery completes to make sure they get the correct peer. An exception is lnet_check_routers() which doesn't do anything with the peer or peer NI after the call to lnet_discover_peer_locked(). If the router list is changed then lnet_check_routers() will already repeat discovery. HPE-bug-id: LUS-9167 Signed-off-by: Chris Horn Change-Id: I8bdfcb957e87f65ce65bfad81858a4ce3362298e Reviewed-on: https://review.whamcloud.com/39747 Reviewed-by: Serguei Smirnov Tested-by: jenkins Tested-by: Maloo Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- lnet/include/lnet/lib-lnet.h | 1 + lnet/lnet/api-ni.c | 12 ++++++++++++ lnet/lnet/lib-move.c | 30 ++++++++++++++++++++++++------ lnet/lnet/peer.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 6 deletions(-) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index b719414..fc4db52 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -889,6 +889,7 @@ struct lnet_peer *lnet_find_peer(lnet_nid_t nid); void lnet_peer_net_added(struct lnet_net *net); lnet_nid_t lnet_peer_primary_nid_locked(lnet_nid_t nid); int lnet_discover_peer_locked(struct lnet_peer_ni *lpni, int cpt, bool block); +void lnet_peer_queue_message(struct lnet_peer *lp, struct lnet_msg *msg); int lnet_peer_discovery_start(void); void lnet_peer_discovery_stop(void); void lnet_push_update_to_peers(int force); diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 4c3f6a4..1b7cb7e 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -4637,6 +4637,18 @@ lnet_discover(struct lnet_process_id id, __u32 force, if (rc) goto out_decref; + /* The lpni (or lp) for this NID may have changed and our ref is + * the only thing keeping the old one around. Release the ref + * and lookup the lpni again + */ + lnet_peer_ni_decref_locked(lpni); + lpni = lnet_find_peer_ni_locked(id.nid); + if (!lpni) { + rc = -ENOENT; + goto out; + } + lp = lpni->lpni_peer_net->lpn_peer; + i = 0; p = NULL; while ((p = lnet_get_next_peer_ni_locked(lp, NULL, p)) != NULL) { diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index f69b21f..3f0f07c 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -2031,6 +2031,7 @@ lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, struct lnet_msg *msg, int cpt) { struct lnet_peer *peer; + struct lnet_peer_ni *new_lpni; int rc; lnet_peer_ni_addref_locked(lpni); @@ -2052,21 +2053,38 @@ lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, struct lnet_msg *msg, lnet_peer_ni_decref_locked(lpni); return rc; } - /* The peer may have changed. */ - peer = lpni->lpni_peer_net->lpn_peer; + + new_lpni = lnet_find_peer_ni_locked(lpni->lpni_nid); + if (!new_lpni) { + lnet_peer_ni_decref_locked(lpni); + return -ENOENT; + } + + peer = new_lpni->lpni_peer_net->lpn_peer; spin_lock(&peer->lp_lock); - if (lnet_peer_is_uptodate_locked(peer)) { + if (lpni == new_lpni && lnet_peer_is_uptodate_locked(peer)) { + /* The peer NI did not change and the peer is up to date. + * Nothing more to do. + */ spin_unlock(&peer->lp_lock); lnet_peer_ni_decref_locked(lpni); + lnet_peer_ni_decref_locked(new_lpni); return 0; } - /* queue message and return */ + spin_unlock(&peer->lp_lock); + + /* Either the peer NI changed during discovery, or the peer isn't up + * to date. In both cases we want to queue the message on the + * (possibly new) peer's pending queue and queue the peer for discovery + */ msg->msg_sending = 0; msg->msg_txpeer = NULL; - list_add_tail(&msg->msg_list, &peer->lp_dc_pendq); - spin_unlock(&peer->lp_lock); + lnet_net_unlock(cpt); + lnet_peer_queue_message(peer, msg); + lnet_net_lock(cpt); lnet_peer_ni_decref_locked(lpni); + lnet_peer_ni_decref_locked(new_lpni); CDEBUG(D_NET, "msg %p delayed. %s pending discovery\n", msg, libcfs_nid2str(peer->lp_primary_nid)); diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 9a6780c..6ec793b 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -1357,6 +1357,16 @@ LNetPrimaryNID(lnet_nid_t nid) rc = lnet_discover_peer_locked(lpni, cpt, true); if (rc) goto out_decref; + /* The lpni (or lp) for this NID may have changed and our ref is + * the only thing keeping the old one around. Release the ref + * and lookup the lpni again + */ + lnet_peer_ni_decref_locked(lpni); + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + rc = -ENOENT; + goto out_unlock; + } lp = lpni->lpni_peer_net->lpn_peer; /* Only try once if discovery is disabled */ @@ -2063,6 +2073,26 @@ __must_hold(&lp->lp_lock) return rc; } +/* Add the message to the peer's lp_dc_pendq and queue the peer for discovery */ +void +lnet_peer_queue_message(struct lnet_peer *lp, struct lnet_msg *msg) +{ + /* The discovery thread holds net_lock/EX and lp_lock when it splices + * the lp_dc_pendq onto a local list for resending. Thus, we do the same + * when adding to the list and queuing the peer to ensure that we do not + * strand any messages on the lp_dc_pendq. This scheme ensures the + * message will be resent even if the peer is already being discovered. + * Therefore we needn't check the return value of + * lnet_peer_queue_for_discovery(lp). + */ + lnet_net_lock(LNET_LOCK_EX); + spin_lock(&lp->lp_lock); + list_add_tail(&msg->msg_list, &lp->lp_dc_pendq); + spin_unlock(&lp->lp_lock); + lnet_peer_queue_for_discovery(lp); + lnet_net_unlock(LNET_LOCK_EX); +} + /* * Queue a peer for the attention of the discovery thread. Call with * lnet_net_lock/EX held. Returns 0 if the peer was queued, and -- 1.8.3.1