From: Chris Horn Date: Mon, 7 Mar 2022 17:03:50 +0000 (-0600) Subject: LU-15616 lnet: ln_api_mutex deadlocks X-Git-Tag: 2.15.0-RC3~2 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=22de0bd145b649768b16dd42559d326af3c13200;p=fs%2Flustre-release.git LU-15616 lnet: ln_api_mutex deadlocks LNetNIFini() acquires the ln_api_mutex and holds onto it throughout various shutdown routines. Meanwhile, LND threads (via lnet_nid2peerni_locked()) or the discovery thread (via lnet_peer_data_present()) may need to acquire this mutex in order to progress. Address these potential deadlocks by setting the_lnet.ln_state to LNET_STATE_STOPPING earlier in LNetNIFini(), and release the mutex prior to any call into LND module or before any wait. LNetNIInit() is modified to return -ESHUTDOWN if it finds that there is a concurrent shutdown in progress. HPE-bug-id: LUS-10681 Signed-off-by: Chris Horn Change-Id: Ia8b28cc95ff71e66a0f99aed4f2c22ec9d44ce1e Reviewed-on: https://review.whamcloud.com/46727 Tested-by: jenkins Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Serguei Smirnov Reviewed-by: Oleg Drokin --- diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index ad990de..76cac6e 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -1350,7 +1350,7 @@ lnet_prepare(lnet_pid_t requested_pid) } static int -lnet_unprepare (void) +lnet_unprepare(void) { /* NB no LNET_LOCK since this is the last reference. All LND instances * have shut down already, so it is safe to unlink and free all @@ -2322,14 +2322,16 @@ lnet_clear_zombies_nis_locked(struct lnet_net *net) islo = ni->ni_net->net_lnd->lnd_type == LOLND; LASSERT(!in_interrupt()); - /* Holding the mutex makes it safe for lnd_shutdown + /* Holding the LND mutex makes it safe for lnd_shutdown * to call module_put(). Module unload cannot finish * until lnet_unregister_lnd() completes, and that - * requires the mutex. + * requires the LND mutex. */ + mutex_unlock(&the_lnet.ln_api_mutex); mutex_lock(&the_lnet.ln_lnd_mutex); (net->net_lnd->lnd_shutdown)(ni); mutex_unlock(&the_lnet.ln_lnd_mutex); + mutex_lock(&the_lnet.ln_api_mutex); if (!islo) CDEBUG(D_LNI, "Removed LNI %s\n", @@ -2400,7 +2402,8 @@ lnet_shutdown_lndnets(void) /* NB called holding the global mutex */ /* All quiet on the API front */ - LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING || + the_lnet.ln_state == LNET_STATE_STOPPING); LASSERT(the_lnet.ln_refcount == 0); lnet_net_lock(LNET_LOCK_EX); @@ -2901,6 +2904,11 @@ LNetNIInit(lnet_pid_t requested_pid) CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount); + if (the_lnet.ln_state == LNET_STATE_STOPPING) { + mutex_unlock(&the_lnet.ln_api_mutex); + return -ESHUTDOWN; + } + if (the_lnet.ln_refcount > 0) { rc = the_lnet.ln_refcount++; mutex_unlock(&the_lnet.ln_api_mutex); @@ -3046,6 +3054,10 @@ LNetNIFini(void) } else { LASSERT(!the_lnet.ln_niinit_self); + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_state = LNET_STATE_STOPPING; + lnet_net_unlock(LNET_LOCK_EX); + lnet_fault_fini(); lnet_router_debugfs_fini(); @@ -3514,6 +3526,10 @@ static int lnet_handle_legacy_ip2nets(char *ip2nets, lnet_set_tune_defaults(tun); mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out; + } while (!list_empty(&net_head)) { net = list_entry(net_head.next, struct lnet_net, net_list); list_del_init(&net->net_list); @@ -3577,8 +3593,10 @@ int lnet_dyn_add_ni(struct lnet_ioctl_config_ni *conf) lnet_set_tune_defaults(tun); mutex_lock(&the_lnet.ln_api_mutex); - - rc = lnet_add_net_common(net, tun); + if (the_lnet.ln_state != LNET_STATE_RUNNING) + rc = -ESHUTDOWN; + else + rc = lnet_add_net_common(net, tun); mutex_unlock(&the_lnet.ln_api_mutex); @@ -3601,6 +3619,10 @@ int lnet_dyn_del_ni(struct lnet_ioctl_config_ni *conf) return -EINVAL; mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto unlock_api_mutex; + } lnet_net_lock(0); @@ -3694,6 +3716,10 @@ lnet_dyn_add_net(struct lnet_ioctl_config_data *conf) return rc == 0 ? -EINVAL : rc; mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out_unlock_clean; + } if (rc > 1) { rc = -EINVAL; /* only add one network per call */ @@ -3746,6 +3772,10 @@ lnet_dyn_del_net(__u32 net_id) return -EINVAL; mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + rc = -ESHUTDOWN; + goto out; + } lnet_net_lock(0); diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 34873e8..9aae6ac 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -4134,7 +4134,9 @@ void lnet_monitor_thr_stop(void) complete(&the_lnet.ln_mt_wait_complete); /* block until monitor thread signals that it's done */ + mutex_unlock(&the_lnet.ln_api_mutex); down(&the_lnet.ln_mt_signal); + mutex_lock(&the_lnet.ln_api_mutex); LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN); /* perform cleanup tasks */ diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index ad77384..72b1382 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -3262,12 +3262,15 @@ __must_hold(&lp->lp_lock) if (lp->lp_state & LNET_PEER_MARK_DELETED) return 0; - if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) - return -ESHUTDOWN; - spin_unlock(&lp->lp_lock); mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING || + the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) { + mutex_unlock(&the_lnet.ln_api_mutex); + spin_lock(&lp->lp_lock); + return -ESHUTDOWN; + } lnet_net_lock(LNET_LOCK_EX); /* remove the peer from the discovery work @@ -3958,8 +3961,10 @@ void lnet_peer_discovery_stop(void) else wake_up(&the_lnet.ln_dc_waitq); + mutex_unlock(&the_lnet.ln_api_mutex); wait_event(the_lnet.ln_dc_waitq, the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN); + mutex_lock(&the_lnet.ln_api_mutex); LASSERT(list_empty(&the_lnet.ln_dc_request)); LASSERT(list_empty(&the_lnet.ln_dc_working));