Whamcloud - gitweb
LU-13782 lnet: Have LNet routers monitor the ni_fatal flag 53/39353/3
author Chris Horn <chris.horn@hpe.com>
Thu, 9 Jul 2020 18:33:49 +0000 (13:33 -0500)
committer Oleg Drokin <green@whamcloud.com>
Fri, 7 Aug 2020 04:59:37 +0000 (04:59 +0000)
Have the LNet monitor thread on LNet routers check the
ni_fatal_error_on flag to set local NI status appropriately. When
this results in a status change, perform a discovery push to all
peers. This allows peers to update their route status appropriately.

Test-Parameters: trivial
HPE-bug-id: LUS-9068
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: Ic4f8f33c6377f4b95f6ab95f9714414c6b9ab5e6
Reviewed-on: https://review.whamcloud.com/39353
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Neil Brown <neilb@suse.de>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Amir Shehata <ashehata@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/lib-lnet.h
lnet/lnet/lib-move.c
lnet/lnet/router.c

index dbe8ba5..6bfe88d 100644 (file)
@@ -215,6 +215,35 @@ extern struct kmem_cache *lnet_small_mds_cachep; /* <= LNET_SMALL_MD_SIZE bytes
 extern struct kmem_cache *lnet_rspt_cachep;
 extern struct kmem_cache *lnet_msg_cachep;
 
+static inline bool /* true only when ns_status was actually rewritten */
+lnet_ni_set_status_locked(struct lnet_ni *ni, __u32 status)
+__must_hold(&ni->ni_lock) /* annotation: caller already holds ni->ni_lock */
+{
+       bool update = false;
+
+       if (ni->ni_status && ni->ni_status->ns_status != status) { /* skip if no status block or value unchanged */
+               CDEBUG(D_NET, "ni %s status changed from %#x to %#x\n",
+                      libcfs_nid2str(ni->ni_nid),
+                      ni->ni_status->ns_status, status);
+               ni->ni_status->ns_status = status; /* store after logging so the old value is still printed */
+               update = true;
+       }
+
+       return update;
+}
+
+static inline bool /* true only when the NI's ns_status was changed */
+lnet_ni_set_status(struct lnet_ni *ni, __u32 status)
+{
+       bool update;
+
+       lnet_ni_lock(ni); /* take the NI lock on behalf of the _locked variant */
+       update = lnet_ni_set_status_locked(ni, status);
+       lnet_ni_unlock(ni);
+
+       return update;
+}
+
 static inline void
 lnet_md_free(struct lnet_libmd *md)
 {
index bd947fc..cec2fd5 100644 (file)
@@ -4287,11 +4287,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid,
                spin_lock(&ni->ni_net->net_lock);
                ni->ni_net->net_last_alive = ktime_get_real_seconds();
                spin_unlock(&ni->ni_net->net_lock);
-               if (ni->ni_status != NULL &&
-                   ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) {
-                       ni->ni_status->ns_status = LNET_NI_STATUS_UP;
-                       push = true;
-               }
+               push = lnet_ni_set_status_locked(ni, LNET_NI_STATUS_UP);
                lnet_ni_unlock(ni);
        }
 
index 35f8c5d..4858ffc 100644 (file)
@@ -1049,15 +1049,9 @@ lnet_net_set_status_locked(struct lnet_net *net, __u32 status)
        struct lnet_ni *ni;
        bool update = false;
 
-       list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
-               lnet_ni_lock(ni);
-               if (ni->ni_status &&
-                   ni->ni_status->ns_status != status) {
-                   ni->ni_status->ns_status = status;
-                   update = true;
-               }
-               lnet_ni_unlock(ni);
-       }
+       list_for_each_entry(ni, &net->net_ni_list, ni_netlist)
+               if (lnet_ni_set_status(ni, status))
+                       update = true;
 
        return update;
 }
@@ -1066,6 +1060,7 @@ static bool
 lnet_update_ni_status_locked(void)
 {
        struct lnet_net *net;
+       struct lnet_ni *ni;
        bool push = false;
        time64_t now;
        time64_t timeout;
@@ -1080,13 +1075,13 @@ lnet_update_ni_status_locked(void)
                        continue;
 
                if (now < net->net_last_alive + timeout)
-                       continue;
+                       goto check_ni_fatal;
 
                spin_lock(&net->net_lock);
                /* re-check with lock */
                if (now < net->net_last_alive + timeout) {
                        spin_unlock(&net->net_lock);
-                       continue;
+                       goto check_ni_fatal;
                }
                spin_unlock(&net->net_lock);
 
@@ -1095,7 +1090,25 @@ lnet_update_ni_status_locked(void)
                 * timeout on any of its constituent NIs, then mark all
                 * the NIs down.
                 */
-               push = lnet_net_set_status_locked(net, LNET_NI_STATUS_DOWN);
+               if (lnet_net_set_status_locked(net, LNET_NI_STATUS_DOWN)) {
+                       push = true;
+                       continue;
+               }
+
+check_ni_fatal:
+               list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+                       /* lnet_ni_set_status() will perform the same check of
+                        * ni_status while holding the ni lock. We can safely
+                        * check ni_status without that lock because it is only
+                        * written to under net_lock/EX and our caller is
+                        * holding a net lock.
+                        */
+                       if (atomic_read(&ni->ni_fatal_error_on) &&
+                           ni->ni_status &&
+                           ni->ni_status->ns_status != LNET_NI_STATUS_DOWN &&
+                           lnet_ni_set_status(ni, LNET_NI_STATUS_DOWN))
+                               push = true;
+               }
        }
 
        return push;