From: Chris Horn Date: Mon, 7 Mar 2022 17:03:50 +0000 (-0600) Subject: LU-15616 lnet: ln_api_mutex deadlocks X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=4b14d9fb00214abffcbe96a5c0759108d194cbf8;p=fs%2Flustre-release.git LU-15616 lnet: ln_api_mutex deadlocks LNetNIFini() acquires the ln_api_mutex and holds onto it throughout various shutdown routines. Meanwhile, LND threads (via lnet_nid2peerni_locked()) or the discovery thread (via lnet_peer_data_present()) may need to acquire this mutex in order to progress. Address these potential deadlocks by setting the_lnet.ln_state to LNET_STATE_STOPPING earlier in LNetNIFini(), and release the mutex prior to any call into LND module or before any wait. LNetNIInit() is modified to return -ESHUTDOWN if it finds that there is a concurrent shutdown in progress. Lustre-change: https://review.whamcloud.com/46727 Lustre-commit: 22de0bd145b649768b16dd42559d326af3c13200 Test-Parameters: trivial HPE-bug-id: LUS-10681 Signed-off-by: Chris Horn Change-Id: Ia8b28cc95ff71e66a0f99aed4f2c22ec9d44ce1e Reviewed-on: https://review.whamcloud.com/48384 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Frank Sehr Reviewed-by: Andreas Dilger --- diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index ff7d230..48350f2 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -1296,7 +1296,7 @@ lnet_prepare(lnet_pid_t requested_pid) } static int -lnet_unprepare (void) +lnet_unprepare(void) { /* NB no LNET_LOCK since this is the last reference. All LND instances * have shut down already, so it is safe to unlink and free all @@ -2144,14 +2144,16 @@ lnet_clear_zombies_nis_locked(struct lnet_net *net) islo = ni->ni_net->net_lnd->lnd_type == LOLND; LASSERT(!in_interrupt()); - /* Holding the mutex makes it safe for lnd_shutdown + /* Holding the LND mutex makes it safe for lnd_shutdown * to call module_put(). Module unload cannot finish * until lnet_unregister_lnd() completes, and that - * requires the mutex. + * requires the LND mutex. */ + mutex_unlock(&the_lnet.ln_api_mutex); mutex_lock(&the_lnet.ln_lnd_mutex); (net->net_lnd->lnd_shutdown)(ni); mutex_unlock(&the_lnet.ln_lnd_mutex); + mutex_lock(&the_lnet.ln_api_mutex); if (!islo) CDEBUG(D_LNI, "Removed LNI %s\n", @@ -2222,7 +2224,8 @@ lnet_shutdown_lndnets(void) /* NB called holding the global mutex */ /* All quiet on the API front */ - LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING || + the_lnet.ln_state == LNET_STATE_STOPPING); LASSERT(the_lnet.ln_refcount == 0); lnet_net_lock(LNET_LOCK_EX); @@ -2646,6 +2649,11 @@ LNetNIInit(lnet_pid_t requested_pid) CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount); + if (the_lnet.ln_state == LNET_STATE_STOPPING) { + mutex_unlock(&the_lnet.ln_api_mutex); + return -ESHUTDOWN; + } + if (the_lnet.ln_refcount > 0) { rc = the_lnet.ln_refcount++; mutex_unlock(&the_lnet.ln_api_mutex); @@ -2791,6 +2799,10 @@ LNetNIFini(void) } else { LASSERT(!the_lnet.ln_niinit_self); + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_state = LNET_STATE_STOPPING; + lnet_net_unlock(LNET_LOCK_EX); + lnet_fault_fini(); lnet_router_debugfs_fini(); @@ -3227,6 +3239,10 @@ static int lnet_handle_legacy_ip2nets(char *ip2nets, lnet_set_tune_defaults(tun); mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out; + } while (!list_empty(&net_head)) { net = list_entry(net_head.next, struct lnet_net, net_list); list_del_init(&net->net_list); @@ -3290,8 +3306,10 @@ int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf) lnet_set_tune_defaults(tun); mutex_lock(&the_lnet.ln_api_mutex); - - rc = lnet_add_net_common(net, tun); + if (the_lnet.ln_state != LNET_STATE_RUNNING) + rc = -ESHUTDOWN; + else + rc = lnet_add_net_common(net, tun); mutex_unlock(&the_lnet.ln_api_mutex); @@ -3314,6 +3332,10 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) return -EINVAL; mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto unlock_api_mutex; + } lnet_net_lock(0); @@ -3407,6 +3429,10 @@ lnet_dyn_add_net(struct lnet_ioctl_config_data *conf) return rc == 0 ? -EINVAL : rc; mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out_unlock_clean; + } if (rc > 1) { rc = -EINVAL; /* only add one network per call */ @@ -3459,6 +3485,10 @@ lnet_dyn_del_net(__u32 net_id) return -EINVAL; mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out; + } lnet_net_lock(0); diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 4ad8cff..2a1eaf0 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -3988,7 +3988,9 @@ void lnet_monitor_thr_stop(void) complete(&the_lnet.ln_mt_wait_complete); /* block until monitor thread signals that it's done */ + mutex_unlock(&the_lnet.ln_api_mutex); down(&the_lnet.ln_mt_signal); + mutex_lock(&the_lnet.ln_api_mutex); LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN); /* perform cleanup tasks */ diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 02076be..a1bd1d4 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -2976,12 +2976,15 @@ __must_hold(&lp->lp_lock) if (lp->lp_state & LNET_PEER_MARK_DELETED) return 0; - if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) - return -ESHUTDOWN; - spin_unlock(&lp->lp_lock); mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING || + the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) { + mutex_unlock(&the_lnet.ln_api_mutex); + spin_lock(&lp->lp_lock); + return -ESHUTDOWN; + } lnet_net_lock(LNET_LOCK_EX); /* remove the peer from the discovery work @@ -3664,8 +3667,10 @@ void lnet_peer_discovery_stop(void) else wake_up(&the_lnet.ln_dc_waitq); + mutex_unlock(&the_lnet.ln_api_mutex); wait_event(the_lnet.ln_dc_waitq, the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN); + mutex_lock(&the_lnet.ln_api_mutex); LASSERT(list_empty(&the_lnet.ln_dc_request)); LASSERT(list_empty(&the_lnet.ln_dc_working));