From: Amir Shehata Date: Tue, 30 Apr 2019 05:57:21 +0000 (-0700) Subject: LU-12249 lnet: fix list corruption X-Git-Tag: 2.12.55~25^2~35 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=d799ac910cd6c980b40c81b76eaefb65b88904d0 LU-12249 lnet: fix list corruption In shutdown the resend queues are cleared and freed. The monitor thread state is set to shutdown. It is possible to get lnet_finalize() called after the queues are freed. The code checks for ln_state to see if we're shutting down. But in this case we should really be checking ln_mt_state. The monitor thread is the one that matters in this case, because it's the one which allocates and frees the resend queues. Test-Parameters: forbuildonly Signed-off-by: Amir Shehata Change-Id: Ia077cec7a52ef5cd2e1b231437c6265ba9416b1b Reviewed-on: https://review.whamcloud.com/34778 Reviewed-by: Olaf Weber Reviewed-by: Sebastien Buisson Reviewed-by: Chris Horn Tested-by: Jenkins --- diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 8e8734b..df230bf 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -3389,7 +3389,9 @@ lnet_monitor_thread(void *arg) lnet_prune_rc_data(1); /* Shutting down */ + lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN; + lnet_net_unlock(LNET_LOCK_EX); /* signal that the monitor thread is exiting */ up(&the_lnet.ln_mt_signal); @@ -3605,7 +3607,9 @@ int lnet_monitor_thr_start(void) sema_init(&the_lnet.ln_mt_signal, 0); + lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_mt_state = LNET_MT_STATE_RUNNING; + lnet_net_unlock(LNET_LOCK_EX); task = kthread_run(lnet_monitor_thread, NULL, "monitor_thread"); if (IS_ERR(task)) { rc = PTR_ERR(task); @@ -3619,13 +3623,17 @@ int lnet_monitor_thr_start(void) return 0; clean_thread: + lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING; + lnet_net_unlock(LNET_LOCK_EX); /* block until event callback signals exit */ down(&the_lnet.ln_mt_signal); /* clean up */ lnet_router_cleanup(); free_mem: + lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN; + lnet_net_unlock(LNET_LOCK_EX); lnet_rsp_tracker_clean(); lnet_clean_local_ni_recoveryq(); lnet_clean_peer_ni_recoveryq(); @@ -3646,7 +3654,9 @@ void lnet_monitor_thr_stop(void) return; LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING); + lnet_net_lock(LNET_LOCK_EX); the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING; + lnet_net_unlock(LNET_LOCK_EX); /* tell the monitor thread that we're shutting down */ wake_up(&the_lnet.ln_mt_waitq); diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 74491a2..32aea2c 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -600,7 +600,7 @@ lnet_health_check(struct lnet_msg *msg) bool lo = false; /* if we're shutting down no point in handling health. */ - if (the_lnet.ln_state != LNET_STATE_RUNNING) + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) return -1; LASSERT(msg->msg_txni); @@ -714,6 +714,12 @@ resend: lnet_net_lock(msg->msg_tx_cpt); + /* check again under lock */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) { + lnet_net_unlock(msg->msg_tx_cpt); + return -1; + } + /* * remove message from the active list and reset it in preparation * for a resend. Two exception to this