Whamcloud - gitweb
LU-15616 lnet: ln_api_mutex deadlocks 27/46727/7
authorChris Horn <chris.horn@hpe.com>
Mon, 7 Mar 2022 17:03:50 +0000 (11:03 -0600)
committerOleg Drokin <green@whamcloud.com>
Sun, 3 Apr 2022 16:08:52 +0000 (16:08 +0000)
LNetNIFini() acquires the ln_api_mutex and holds onto it throughout
various shutdown routines. Meanwhile, LND threads (via
lnet_nid2peerni_locked()) or the discovery thread (via
lnet_peer_data_present()) may need to acquire this mutex in order to
progress.

Address these potential deadlocks by setting the_lnet.ln_state to
LNET_STATE_STOPPING earlier in LNetNIFini(), and release the mutex
prior to any call into LND module or before any wait.

LNetNIInit() is modified to return -ESHUTDOWN if it finds that there
is a concurrent shutdown in progress.

HPE-bug-id: LUS-10681
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: Ia8b28cc95ff71e66a0f99aed4f2c22ec9d44ce1e
Reviewed-on: https://review.whamcloud.com/46727
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/lnet/api-ni.c
lnet/lnet/lib-move.c
lnet/lnet/peer.c

index ad990de..76cac6e 100644 (file)
@@ -1350,7 +1350,7 @@ lnet_prepare(lnet_pid_t requested_pid)
 }
 
 static int
-lnet_unprepare (void)
+lnet_unprepare(void)
 {
        /* NB no LNET_LOCK since this is the last reference.  All LND instances
         * have shut down already, so it is safe to unlink and free all
@@ -2322,14 +2322,16 @@ lnet_clear_zombies_nis_locked(struct lnet_net *net)
                islo = ni->ni_net->net_lnd->lnd_type == LOLND;
 
                LASSERT(!in_interrupt());
-               /* Holding the mutex makes it safe for lnd_shutdown
+               /* Holding the LND mutex makes it safe for lnd_shutdown
                 * to call module_put(). Module unload cannot finish
                 * until lnet_unregister_lnd() completes, and that
-                * requires the mutex.
+                * requires the LND mutex.
                 */
+               mutex_unlock(&the_lnet.ln_api_mutex);
                mutex_lock(&the_lnet.ln_lnd_mutex);
                (net->net_lnd->lnd_shutdown)(ni);
                mutex_unlock(&the_lnet.ln_lnd_mutex);
+               mutex_lock(&the_lnet.ln_api_mutex);
 
                if (!islo)
                        CDEBUG(D_LNI, "Removed LNI %s\n",
@@ -2400,7 +2402,8 @@ lnet_shutdown_lndnets(void)
        /* NB called holding the global mutex */
 
        /* All quiet on the API front */
-       LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING);
+       LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING ||
+               the_lnet.ln_state == LNET_STATE_STOPPING);
        LASSERT(the_lnet.ln_refcount == 0);
 
        lnet_net_lock(LNET_LOCK_EX);
@@ -2901,6 +2904,11 @@ LNetNIInit(lnet_pid_t requested_pid)
 
        CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount);
 
+       if (the_lnet.ln_state == LNET_STATE_STOPPING) {
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return -ESHUTDOWN;
+       }
+
        if (the_lnet.ln_refcount > 0) {
                rc = the_lnet.ln_refcount++;
                mutex_unlock(&the_lnet.ln_api_mutex);
@@ -3046,6 +3054,10 @@ LNetNIFini(void)
        } else {
                LASSERT(!the_lnet.ln_niinit_self);
 
+               lnet_net_lock(LNET_LOCK_EX);
+               the_lnet.ln_state = LNET_STATE_STOPPING;
+               lnet_net_unlock(LNET_LOCK_EX);
+
                lnet_fault_fini();
 
                lnet_router_debugfs_fini();
@@ -3514,6 +3526,10 @@ static int lnet_handle_legacy_ip2nets(char *ip2nets,
        lnet_set_tune_defaults(tun);
 
        mutex_lock(&the_lnet.ln_api_mutex);
+       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+               rc = -ESHUTDOWN;
+               goto out;
+       }
        while (!list_empty(&net_head)) {
                net = list_entry(net_head.next, struct lnet_net, net_list);
                list_del_init(&net->net_list);
@@ -3577,8 +3593,10 @@ int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf)
        lnet_set_tune_defaults(tun);
 
        mutex_lock(&the_lnet.ln_api_mutex);
-
-       rc = lnet_add_net_common(net, tun);
+       if (the_lnet.ln_state != LNET_STATE_RUNNING)
+               rc = -ESHUTDOWN;
+       else
+               rc = lnet_add_net_common(net, tun);
 
        mutex_unlock(&the_lnet.ln_api_mutex);
 
@@ -3601,6 +3619,10 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf)
                return -EINVAL;
 
        mutex_lock(&the_lnet.ln_api_mutex);
+       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+               rc = -ESHUTDOWN;
+               goto unlock_api_mutex;
+       }
 
        lnet_net_lock(0);
 
@@ -3694,6 +3716,10 @@ lnet_dyn_add_net(struct lnet_ioctl_config_data *conf)
                return rc == 0 ? -EINVAL : rc;
 
        mutex_lock(&the_lnet.ln_api_mutex);
+       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+               rc = -ESHUTDOWN;
+               goto out_unlock_clean;
+       }
 
        if (rc > 1) {
                rc = -EINVAL; /* only add one network per call */
@@ -3746,6 +3772,10 @@ lnet_dyn_del_net(__u32 net_id)
                return -EINVAL;
 
        mutex_lock(&the_lnet.ln_api_mutex);
+       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+               rc = -ESHUTDOWN;
+               goto out;
+       }
 
        lnet_net_lock(0);
 
index 34873e8..9aae6ac 100644 (file)
@@ -4134,7 +4134,9 @@ void lnet_monitor_thr_stop(void)
        complete(&the_lnet.ln_mt_wait_complete);
 
        /* block until monitor thread signals that it's done */
+       mutex_unlock(&the_lnet.ln_api_mutex);
        down(&the_lnet.ln_mt_signal);
+       mutex_lock(&the_lnet.ln_api_mutex);
        LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN);
 
        /* perform cleanup tasks */
index ad77384..72b1382 100644 (file)
@@ -3262,12 +3262,15 @@ __must_hold(&lp->lp_lock)
        if (lp->lp_state & LNET_PEER_MARK_DELETED)
                return 0;
 
-       if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING)
-               return -ESHUTDOWN;
-
        spin_unlock(&lp->lp_lock);
 
        mutex_lock(&the_lnet.ln_api_mutex);
+       if (the_lnet.ln_state != LNET_STATE_RUNNING ||
+           the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) {
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               spin_lock(&lp->lp_lock);
+               return -ESHUTDOWN;
+       }
 
        lnet_net_lock(LNET_LOCK_EX);
        /* remove the peer from the discovery work
@@ -3958,8 +3961,10 @@ void lnet_peer_discovery_stop(void)
        else
                wake_up(&the_lnet.ln_dc_waitq);
 
+       mutex_unlock(&the_lnet.ln_api_mutex);
        wait_event(the_lnet.ln_dc_waitq,
                   the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN);
+       mutex_lock(&the_lnet.ln_api_mutex);
 
        LASSERT(list_empty(&the_lnet.ln_dc_request));
        LASSERT(list_empty(&the_lnet.ln_dc_working));