Whamcloud - gitweb
LU-12537 lnet: Sync the start of discovery and monitor threads 78/35478/3
author Chris Horn <hornc@cray.com>
Sun, 14 Jul 2019 14:10:29 +0000 (09:10 -0500)
committer Oleg Drokin <green@whamcloud.com>
Fri, 9 Aug 2019 04:39:38 +0000 (04:39 +0000)
The discovery thread starts up before the monitor thread, so it may
issue PUTs or GETs before the monitor thread has a chance to
initialize its data structures (namely the_lnet.ln_mt_rstq). This can
result in an oops when we attempt to attach response trackers to MDs.

Introduce a completion to synchronize the startup of these threads.

Signed-off-by: Chris Horn <hornc@cray.com>
Change-Id: I5d7356269090d8cbd1eab59fa29bee7ef211832f
Reviewed-on: https://review.whamcloud.com/35478
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexandr Boyko <c17825@cray.com>
Reviewed-by: Amir Shehata <ashehata@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/lib-types.h
lnet/lnet/api-ni.c
lnet/lnet/lib-move.c
lnet/lnet/peer.c

index 0550090..c3feaea 100644 (file)
@@ -1143,6 +1143,11 @@ struct lnet {
        /* recovery eq handler */
        struct lnet_handle_eq           ln_mt_eqh;
 
        /* recovery eq handler */
        struct lnet_handle_eq           ln_mt_eqh;
 
+       /*
+        * Completed when the discovery and monitor threads can enter their
+        * work loops
+        */
+       struct completion               ln_started;
 };
 
 #endif
 };
 
 #endif
index be77024..d94f9f3 100644 (file)
@@ -1134,6 +1134,7 @@ lnet_prepare(lnet_pid_t requested_pid)
        INIT_LIST_HEAD(&the_lnet.ln_mt_peerNIRecovq);
        init_waitqueue_head(&the_lnet.ln_dc_waitq);
        LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh);
        INIT_LIST_HEAD(&the_lnet.ln_mt_peerNIRecovq);
        init_waitqueue_head(&the_lnet.ln_dc_waitq);
        LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh);
+       init_completion(&the_lnet.ln_started);
 
        rc = lnet_descriptor_setup();
        if (rc != 0)
 
        rc = lnet_descriptor_setup();
        if (rc != 0)
@@ -2666,6 +2667,8 @@ LNetNIInit(lnet_pid_t requested_pid)
 
        mutex_unlock(&the_lnet.ln_api_mutex);
 
 
        mutex_unlock(&the_lnet.ln_api_mutex);
 
+       complete_all(&the_lnet.ln_started);
+
        /* wait for all routers to start */
        lnet_wait_router_start();
 
        /* wait for all routers to start */
        lnet_wait_router_start();
 
index 9b463e2..c1d4e84 100644 (file)
@@ -3446,6 +3446,7 @@ lnet_monitor_thread(void *arg)
        int interval;
        time64_t now;
 
        int interval;
        time64_t now;
 
+       wait_for_completion(&the_lnet.ln_started);
        /*
         * The monitor thread takes care of the following:
         *  1. Checks the aliveness of routers
        /*
         * The monitor thread takes care of the following:
         *  1. Checks the aliveness of routers
index 8e3f8f6..43c8d35 100644 (file)
@@ -3276,6 +3276,8 @@ static int lnet_peer_discovery(void *arg)
        struct lnet_peer *lp;
        int rc;
 
        struct lnet_peer *lp;
        int rc;
 
+       wait_for_completion(&the_lnet.ln_started);
+
        CDEBUG(D_NET, "started\n");
        cfs_block_allsigs();
 
        CDEBUG(D_NET, "started\n");
        cfs_block_allsigs();
 
@@ -3448,7 +3450,14 @@ void lnet_peer_discovery_stop(void)
 
        LASSERT(the_lnet.ln_dc_state == LNET_DC_STATE_RUNNING);
        the_lnet.ln_dc_state = LNET_DC_STATE_STOPPING;
 
        LASSERT(the_lnet.ln_dc_state == LNET_DC_STATE_RUNNING);
        the_lnet.ln_dc_state = LNET_DC_STATE_STOPPING;
-       wake_up(&the_lnet.ln_dc_waitq);
+
+       /* In the LNetNIInit() path we may be stopping discovery before it
+        * entered its work loop
+        */
+       if (!completion_done(&the_lnet.ln_started))
+               complete(&the_lnet.ln_started);
+       else
+               wake_up(&the_lnet.ln_dc_waitq);
 
        wait_event(the_lnet.ln_dc_waitq,
                   the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN);
 
        wait_event(the_lnet.ln_dc_waitq,
                   the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN);