Whamcloud - gitweb
LU-9119 lnet: fix race in lnet shutdown path 90/26690/4
authorOlaf Weber <olaf@sgi.com>
Fri, 27 Jan 2017 15:13:29 +0000 (16:13 +0100)
committerOleg Drokin <oleg.drokin@intel.com>
Fri, 5 May 2017 00:43:45 +0000 (00:43 +0000)
The locking changes for the lnet_net_lock made for Multi-Rail
introduce a race in the LNet shutdown path. The code keeps two
states in the_lnet.ln_shutdown: 0 means LNet is either up and
running or shut down, while 1 means lnet is shutting down. In
lnet_select_pathway() if we need to restart and drop and relock
the lnet_net_lock we can find that LNet went from running to
stopped, and not be able to tell the difference.

Replace ln_shutdown with a three-state ln_state patterned on
ln_rc_state: states are LNET_STATE_SHUTDOWN, LNET_STATE_RUNNING,
and LNET_STATE_STOPPING. Most checks against ln_shutdown now test
ln_state against LNET_STATE_RUNNING. LNet moves to RUNNING state
in lnet_startup_lndnets().

Test-Parameters: trivial
Signed-off-by: Olaf Weber <olaf@sgi.com>
Change-Id: I7afcbeb793dfa4d0a361e421ae06a99b7d4db903
Reviewed-on: https://review.whamcloud.com/26690
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Doug Oucharek <doug.s.oucharek@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lnet/include/lnet/lib-types.h
lnet/lnet/api-ni.c
lnet/lnet/lib-move.c
lnet/lnet/lib-ptl.c
lnet/lnet/peer.c
lnet/lnet/router.c

index cfa4c39..c72771c 100644 (file)
@@ -734,6 +734,11 @@ struct lnet_msg_container {
 #define LNET_RC_STATE_RUNNING          1       /* started up OK */
 #define LNET_RC_STATE_STOPPING         2       /* telling thread to stop */
 
+/* LNet states */
+#define LNET_STATE_SHUTDOWN            0       /* not started */
+#define LNET_STATE_RUNNING             1       /* started up OK */
+#define LNET_STATE_STOPPING            2       /* telling thread to stop */
+
 typedef struct lnet {
        /* CPU partition table of LNet */
        struct cfs_cpt_table            *ln_cpt_table;
@@ -812,8 +817,8 @@ typedef struct lnet {
        int                             ln_niinit_self;
        /* LNetNIInit/LNetNIFini counter */
        int                             ln_refcount;
-       /* shutdown in progress */
-       int                             ln_shutdown;
+       /* SHUTDOWN/RUNNING/STOPPING */
+       int                             ln_state;
 
        int                             ln_routing;     /* am I a router? */
        lnet_pid_t                      ln_pid;         /* requested pid */
index 204fe49..e63feac 100644 (file)
@@ -1354,11 +1354,11 @@ lnet_shutdown_lndnets(void)
        /* NB called holding the global mutex */
 
        /* All quiet on the API front */
-       LASSERT(!the_lnet.ln_shutdown);
+       LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING);
        LASSERT(the_lnet.ln_refcount == 0);
 
        lnet_net_lock(LNET_LOCK_EX);
-       the_lnet.ln_shutdown = 1;       /* flag shutdown */
+       the_lnet.ln_state = LNET_STATE_STOPPING;
 
        while (!list_empty(&the_lnet.ln_nets)) {
                /*
@@ -1386,7 +1386,7 @@ lnet_shutdown_lndnets(void)
        }
 
        lnet_net_lock(LNET_LOCK_EX);
-       the_lnet.ln_shutdown = 0;
+       the_lnet.ln_state = LNET_STATE_SHUTDOWN;
        lnet_net_unlock(LNET_LOCK_EX);
 }
 
@@ -1657,6 +1657,15 @@ lnet_startup_lndnets(struct list_head *netlist)
        int                     rc;
        int                     ni_count = 0;
 
+       /*
+        * Change to running state before bringing up the LNDs. This
+        * allows lnet_shutdown_lndnets() to assert that we've passed
+        * through here.
+        */
+       lnet_net_lock(LNET_LOCK_EX);
+       the_lnet.ln_state = LNET_STATE_RUNNING;
+       lnet_net_unlock(LNET_LOCK_EX);
+
        while (!list_empty(netlist)) {
                net = list_entry(netlist->next, struct lnet_net, net_list);
                list_del_init(&net->net_list);
index 5f4045e..ec3051c 100644 (file)
@@ -1423,7 +1423,7 @@ again:
 
        seq = lnet_get_dlc_seq_locked();
 
-       if (the_lnet.ln_shutdown) {
+       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
                lnet_net_unlock(cpt);
                return -ESHUTDOWN;
        }
index ec5151e..3773ed9 100644 (file)
@@ -584,7 +584,7 @@ lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg)
        mtable = lnet_mt_of_match(info, msg);
        lnet_res_lock(mtable->mt_cpt);
 
-       if (the_lnet.ln_shutdown) {
+       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
                rc = LNET_MATCHMD_DROP;
                goto out1;
        }
@@ -946,7 +946,7 @@ lnet_clear_lazy_portal(struct lnet_ni *ni, int portal, char *reason)
                                list_move(&msg->msg_list, &zombies);
                }
        } else {
-               if (the_lnet.ln_shutdown)
+               if (the_lnet.ln_state != LNET_STATE_RUNNING)
                        CWARN("Active lazy portal %d on exit\n", portal);
                else
                        CDEBUG(D_NET, "clearing portal %d lazy\n", portal);
index 7b506a4..47e5ce2 100644 (file)
@@ -427,7 +427,7 @@ lnet_peer_tables_cleanup(struct lnet_net *net)
        int                             i;
        struct lnet_peer_table          *ptable;
 
-       LASSERT(the_lnet.ln_shutdown || net != NULL);
+       LASSERT(the_lnet.ln_state != LNET_STATE_SHUTDOWN || net != NULL);
        /* If just deleting the peers for a NI, get rid of any routes these
         * peers are gateways for. */
        cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) {
@@ -453,7 +453,7 @@ lnet_get_peer_ni_locked(struct lnet_peer_table *ptable, lnet_nid_t nid)
        struct list_head        *peers;
        struct lnet_peer_ni     *lp;
 
-       LASSERT(!the_lnet.ln_shutdown);
+       LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING);
 
        peers = &ptable->pt_hash[lnet_nid2peerhash(nid)];
        list_for_each_entry(lp, peers, lpni_hashlist) {
@@ -994,7 +994,7 @@ lnet_nid2peerni_ex(lnet_nid_t nid, int cpt)
        struct lnet_peer_ni *lpni = NULL;
        int rc;
 
-       if (the_lnet.ln_shutdown) /* it's shutting down */
+       if (the_lnet.ln_state != LNET_STATE_RUNNING)
                return ERR_PTR(-ESHUTDOWN);
 
        /*
@@ -1028,7 +1028,7 @@ lnet_nid2peerni_locked(lnet_nid_t nid, int cpt)
        struct lnet_peer_ni *lpni = NULL;
        int rc;
 
-       if (the_lnet.ln_shutdown) /* it's shutting down */
+       if (the_lnet.ln_state != LNET_STATE_RUNNING)
                return ERR_PTR(-ESHUTDOWN);
 
        /*
@@ -1057,7 +1057,7 @@ lnet_nid2peerni_locked(lnet_nid_t nid, int cpt)
         * Shutdown is only set under the ln_api_lock, so a single
         * check here is sufficent.
         */
-       if (the_lnet.ln_shutdown) {
+       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
                lpni = ERR_PTR(-ESHUTDOWN);
                goto out_mutex_unlock;
        }
index 0e929bf..aba5699 100644 (file)
@@ -249,7 +249,7 @@ lnet_find_rnet_locked(__u32 net)
        struct list_head *tmp;
        struct list_head *rn_list;
 
-       LASSERT(!the_lnet.ln_shutdown);
+       LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING);
 
        rn_list = lnet_net2rnethash(net);
        list_for_each(tmp, rn_list) {
@@ -381,7 +381,7 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway,
                return rc;
        }
        route->lr_gateway = lpni;
-       LASSERT(!the_lnet.ln_shutdown);
+       LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING);
 
        rnet2 = lnet_find_rnet_locked(net);
        if (rnet2 == NULL) {
@@ -1804,7 +1804,7 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, cfs_time_t when)
 
        lnet_net_lock(cpt);
 
-       if (the_lnet.ln_shutdown) {
+       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
                lnet_net_unlock(cpt);
                return -ESHUTDOWN;
        }