Whamcloud - gitweb
LU-13648 lnet: Set remote NI status in lnet_notify 62/38862/2
author Chris Horn <chris.horn@hpe.com>
Fri, 22 May 2020 01:49:53 +0000 (20:49 -0500)
committer Oleg Drokin <green@whamcloud.com>
Fri, 10 Jul 2020 16:52:38 +0000 (16:52 +0000)
The gnilnd receives node health information asynchronously from any tx
failure, so the aliveness of an lpni as reported by
lnet_is_peer_ni_alive() may not match what the LND is telling us. Use
the existing reset flag to set the cached NI status down so we can be
sure that remote NIs are correctly set down.

Test-Parameters: trivial
HPE-bug-id: LUS-8897
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: I1ab36b63d83fb35803eb13a330d698cfa49f17e9
Reviewed-on: https://review.whamcloud.com/38862
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Amir Shehata <ashehata@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/gnilnd/gnilnd.c
lnet/lnet/router.c

index bbc3daa..f1a7734 100644 (file)
@@ -606,8 +606,7 @@ kgnilnd_peer_notify(kgn_peer_t *peer, int error, int alive)
                                peer, libcfs_nid2str(peer_nid), peer->gnp_last_alive,
                                ktime_get_seconds() - peer->gnp_last_alive);
 
-                       lnet_notify(net->gnn_ni, peer_nid, alive,
-                                   (alive) ? true : false,
+                       lnet_notify(net->gnn_ni, peer_nid, alive, true,
                                    peer->gnp_last_alive);
 
                        kgnilnd_net_decref(net);
index 3fb67db..35f8c5d 100644 (file)
@@ -1694,12 +1694,11 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset,
        time64_t now = ktime_get_seconds();
        int cpt;
 
-       LASSERT (!in_interrupt ());
+       LASSERT(!in_interrupt());
 
-       CDEBUG (D_NET, "%s notifying %s: %s\n",
-               (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
-               libcfs_nid2str(nid),
-               alive ? "up" : "down");
+       CDEBUG(D_NET, "%s notifying %s: %s\n",
+              (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+              libcfs_nid2str(nid), alive ? "up" : "down");
 
        if (ni != NULL &&
            LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) {
@@ -1742,6 +1741,7 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset,
 
        if (alive) {
                if (reset) {
+                       lpni->lpni_ns_status = LNET_NI_STATUS_UP;
                        lnet_set_lpni_healthv_locked(lpni,
                                                     LNET_MAX_HEALTH_VALUE);
                } else {
@@ -1752,6 +1752,8 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset,
                                        (sensitivity) ? sensitivity :
                                        lnet_health_sensitivity);
                }
+       } else if (reset) {
+               lpni->lpni_ns_status = LNET_NI_STATUS_DOWN;
        }
 
        /* recalculate aliveness */