Whamcloud - gitweb
LU-12249 lnet: fix list corruption 78/34778/8
author: Amir Shehata <ashehata@whamcloud.com>
Tue, 30 Apr 2019 05:57:21 +0000 (22:57 -0700)
committer: Amir Shehata <ashehata@whamcloud.com>
Fri, 7 Jun 2019 18:07:52 +0000 (18:07 +0000)
During shutdown the resend queues are cleared and freed, and the monitor
thread state is set to shutdown. It is possible for lnet_finalize() to be
called after the queues have been freed. The code checks ln_state to see
whether we are shutting down, but in this case it should really be
checking ln_mt_state: the monitor thread is the one that matters here,
because it is the one that allocates and frees the resend queues.

Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: Ia077cec7a52ef5cd2e1b231437c6265ba9416b1b
Reviewed-on: https://review.whamcloud.com/34778
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Reviewed-by: Sebastien Buisson <sbuisson@ddn.com>
Reviewed-by: Chris Horn <hornc@cray.com>
Tested-by: Jenkins
lnet/lnet/lib-move.c
lnet/lnet/lib-msg.c

index 8e8734b..df230bf 100644 (file)
@@ -3389,7 +3389,9 @@ lnet_monitor_thread(void *arg)
        lnet_prune_rc_data(1);
 
        /* Shutting down */
        lnet_prune_rc_data(1);
 
        /* Shutting down */
+       lnet_net_lock(LNET_LOCK_EX);
        the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN;
        the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN;
+       lnet_net_unlock(LNET_LOCK_EX);
 
        /* signal that the monitor thread is exiting */
        up(&the_lnet.ln_mt_signal);
 
        /* signal that the monitor thread is exiting */
        up(&the_lnet.ln_mt_signal);
@@ -3605,7 +3607,9 @@ int lnet_monitor_thr_start(void)
 
        sema_init(&the_lnet.ln_mt_signal, 0);
 
 
        sema_init(&the_lnet.ln_mt_signal, 0);
 
+       lnet_net_lock(LNET_LOCK_EX);
        the_lnet.ln_mt_state = LNET_MT_STATE_RUNNING;
        the_lnet.ln_mt_state = LNET_MT_STATE_RUNNING;
+       lnet_net_unlock(LNET_LOCK_EX);
        task = kthread_run(lnet_monitor_thread, NULL, "monitor_thread");
        if (IS_ERR(task)) {
                rc = PTR_ERR(task);
        task = kthread_run(lnet_monitor_thread, NULL, "monitor_thread");
        if (IS_ERR(task)) {
                rc = PTR_ERR(task);
@@ -3619,13 +3623,17 @@ int lnet_monitor_thr_start(void)
        return 0;
 
 clean_thread:
        return 0;
 
 clean_thread:
+       lnet_net_lock(LNET_LOCK_EX);
        the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING;
        the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING;
+       lnet_net_unlock(LNET_LOCK_EX);
        /* block until event callback signals exit */
        down(&the_lnet.ln_mt_signal);
        /* clean up */
        lnet_router_cleanup();
 free_mem:
        /* block until event callback signals exit */
        down(&the_lnet.ln_mt_signal);
        /* clean up */
        lnet_router_cleanup();
 free_mem:
+       lnet_net_lock(LNET_LOCK_EX);
        the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN;
        the_lnet.ln_mt_state = LNET_MT_STATE_SHUTDOWN;
+       lnet_net_unlock(LNET_LOCK_EX);
        lnet_rsp_tracker_clean();
        lnet_clean_local_ni_recoveryq();
        lnet_clean_peer_ni_recoveryq();
        lnet_rsp_tracker_clean();
        lnet_clean_local_ni_recoveryq();
        lnet_clean_peer_ni_recoveryq();
@@ -3646,7 +3654,9 @@ void lnet_monitor_thr_stop(void)
                return;
 
        LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING);
                return;
 
        LASSERT(the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING);
+       lnet_net_lock(LNET_LOCK_EX);
        the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING;
        the_lnet.ln_mt_state = LNET_MT_STATE_STOPPING;
+       lnet_net_unlock(LNET_LOCK_EX);
 
        /* tell the monitor thread that we're shutting down */
        wake_up(&the_lnet.ln_mt_waitq);
 
        /* tell the monitor thread that we're shutting down */
        wake_up(&the_lnet.ln_mt_waitq);
index 74491a2..32aea2c 100644 (file)
@@ -600,7 +600,7 @@ lnet_health_check(struct lnet_msg *msg)
        bool lo = false;
 
        /* if we're shutting down no point in handling health. */
        bool lo = false;
 
        /* if we're shutting down no point in handling health. */
-       if (the_lnet.ln_state != LNET_STATE_RUNNING)
+       if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING)
                return -1;
 
        LASSERT(msg->msg_txni);
                return -1;
 
        LASSERT(msg->msg_txni);
@@ -714,6 +714,12 @@ resend:
 
        lnet_net_lock(msg->msg_tx_cpt);
 
 
        lnet_net_lock(msg->msg_tx_cpt);
 
+       /* check again under lock */
+       if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) {
+               lnet_net_unlock(msg->msg_tx_cpt);
+               return -1;
+       }
+
        /*
         * remove message from the active list and reset it in preparation
         * for a resend. Two exception to this
        /*
         * remove message from the active list and reset it in preparation
         * for a resend. Two exception to this