From: Chris Horn Date: Fri, 22 May 2020 01:49:53 +0000 (-0500) Subject: LU-13648 lnet: Set remote NI status in lnet_notify X-Git-Tag: 2.13.55~29 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=8010dbb6607664a613e6496d42ee70d40a15dc6a LU-13648 lnet: Set remote NI status in lnet_notify The gnilnd receives node health information asynchronously from any tx failure, so the aliveness of an lpni as reported by lnet_is_peer_ni_alive() may not match what the LND is telling us. Use the existing reset flag to set the cached NI status to down so we can be sure that remote NIs are correctly set down. Test-Parameters: trivial HPE-bug-id: LUS-8897 Signed-off-by: Chris Horn Change-Id: I1ab36b63d83fb35803eb13a330d698cfa49f17e9 Reviewed-on: https://review.whamcloud.com/38862 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Amir Shehata Reviewed-by: Serguei Smirnov Reviewed-by: Oleg Drokin --- diff --git a/lnet/klnds/gnilnd/gnilnd.c b/lnet/klnds/gnilnd/gnilnd.c index bbc3daa..f1a7734 100644 --- a/lnet/klnds/gnilnd/gnilnd.c +++ b/lnet/klnds/gnilnd/gnilnd.c @@ -606,8 +606,7 @@ kgnilnd_peer_notify(kgn_peer_t *peer, int error, int alive) peer, libcfs_nid2str(peer_nid), peer->gnp_last_alive, ktime_get_seconds() - peer->gnp_last_alive); - lnet_notify(net->gnn_ni, peer_nid, alive, - (alive) ? true : false, + lnet_notify(net->gnn_ni, peer_nid, alive, true, peer->gnp_last_alive); kgnilnd_net_decref(net); diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 3fb67db..35f8c5d 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -1694,12 +1694,11 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset, time64_t now = ktime_get_seconds(); int cpt; - LASSERT (!in_interrupt ()); + LASSERT(!in_interrupt()); - CDEBUG (D_NET, "%s notifying %s: %s\n", - (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), - libcfs_nid2str(nid), - alive ? "up" : "down"); + CDEBUG(D_NET, "%s notifying %s: %s\n", + (ni == NULL) ? 
"userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), alive ? "up" : "down"); if (ni != NULL && LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) { @@ -1742,6 +1741,7 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset, if (alive) { if (reset) { + lpni->lpni_ns_status = LNET_NI_STATUS_UP; lnet_set_lpni_healthv_locked(lpni, LNET_MAX_HEALTH_VALUE); } else { @@ -1752,6 +1752,8 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset, (sensitivity) ? sensitivity : lnet_health_sensitivity); } + } else if (reset) { + lpni->lpni_ns_status = LNET_NI_STATUS_DOWN; } /* recalculate aliveness */