From 1896b521a3d71d4c2c104330934da93cbfd0a9a0 Mon Sep 17 00:00:00 2001 From: Serguei Smirnov Date: Tue, 11 Apr 2023 08:37:43 -0700 Subject: [PATCH] EX-7251 lnet: update locking multiple NIDs of the MR peer Port updates to LU-16709 lnet: fix locking multiple NIDs of the MR peer This allows for the first of the two locked NIDs to stay primary as intended for the purpose of communicating with Lustre even if peer discovery succeeded using a different NID of MR peer. Lustre-change: https://review.whamcloud.com/50530 Lustre-commit: TBD (ddc9652a238e146e215157572b2e7e119de0e63b) Signed-off-by: Serguei Smirnov Change-Id: Ic66c3b6d4dec98540e4fa2d7fa51c0e5e2f442ed Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/50603 Reviewed-by: Frank Sehr Reviewed-by: Andreas Dilger Tested-by: jenkins Tested-by: Andreas Dilger --- lnet/lnet/peer.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 31b7f86..0a49691 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -1316,7 +1316,7 @@ LNetPrimaryNID(lnet_nid_t nid) /* force a full discovery cycle */ lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH | LNET_PEER_LOCK_PRIMARY; - lp->lp_prim_lock_ts = ktime_get_real_ns(); + lp->lp_prim_lock_ts = ktime_get_ns(); spin_unlock(&lp->lp_lock); /* start discovery in the background. Messages to that @@ -1443,7 +1443,7 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp, } if (flags & LNET_PEER_LOCK_PRIMARY) { lp->lp_state |= LNET_PEER_LOCK_PRIMARY; - lp->lp_prim_lock_ts = ktime_get_real_ns(); + lp->lp_prim_lock_ts = ktime_get_ns(); } spin_unlock(&lp->lp_lock); @@ -1606,41 +1606,52 @@ lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) struct lnet_peer *lp2 = lpni->lpni_peer_net->lpn_peer; int rtr_refcount = lp2->lp_rtr_refcount; + unsigned peer2_state; + __u64 peer2_prim_lock_ts; /* If there's another peer that this NID belongs to - * and the primary NID for another peer is locked, + * and the primary NID for that peer is locked, * then, unless it is the only NID, we don't want * to mess with it. * But the configuration is wrong at this point, * so we should flag both of these peers as in a bad * state */ + spin_lock(&lp2->lp_lock); if (lp2->lp_state & LNET_PEER_LOCK_PRIMARY && lp2->lp_nnis > 1) { + lp2->lp_state |= LNET_PEER_BAD_CONFIG; + spin_unlock(&lp2->lp_lock); spin_lock(&lp->lp_lock); lp->lp_state |= LNET_PEER_BAD_CONFIG; spin_unlock(&lp->lp_lock); - spin_lock(&lp2->lp_lock); - lp2->lp_state |= LNET_PEER_BAD_CONFIG; - spin_unlock(&lp2->lp_lock); + CERROR("Peer %s NID %s is already locked with peer %s\n", + libcfs_nid2str(lp->lp_primary_nid), + libcfs_nid2str(nid), + libcfs_nid2str(lp2->lp_primary_nid)); goto out_free_lpni; } + peer2_state = lp2->lp_state; + peer2_prim_lock_ts = lp2->lp_prim_lock_ts; + spin_unlock(&lp2->lp_lock); /* If both peers have their primary NIDs locked, * the NID which got locked the earliest should be * kept as primary. In case if the peers were - * created with LNetPrimaryNID, this allows the + * created by Lustre, this allows the * first listed NID to stay primary as intended * for the purpose of communicating with Lustre * even if peer discovery succeeded using * a different NID of MR peer. */ - if (lp2->lp_state & LNET_PEER_LOCK_PRIMARY && + spin_lock(&lp->lp_lock); + if (peer2_state & LNET_PEER_LOCK_PRIMARY && lp->lp_state & LNET_PEER_LOCK_PRIMARY && - lp2->lp_prim_lock_ts < lp->lp_prim_lock_ts) { - lp->lp_prim_lock_ts = lp2->lp_prim_lock_ts; + peer2_prim_lock_ts < lp->lp_prim_lock_ts) { + lp->lp_prim_lock_ts = peer2_prim_lock_ts; lp->lp_primary_nid = nid; } + spin_unlock(&lp->lp_lock); /* * if we're trying to delete a router it means -- 1.8.3.1