The discovery thread starts up before the monitor thread, so it may
issue PUTs or GETs before the monitor thread has had a chance to
initialize its data structures (namely the_lnet.ln_mt_rstq). This can
result in a kernel oops when we attempt to attach response trackers to MDs.
Introduce a completion to synchronize the startup of these threads.
Signed-off-by: Chris Horn <hornc@cray.com>
Change-Id: I5d7356269090d8cbd1eab59fa29bee7ef211832f
Reviewed-on: https://review.whamcloud.com/35478
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexandr Boyko <c17825@cray.com>
Reviewed-by: Amir Shehata <ashehata@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
/* recovery eq handler */
struct lnet_handle_eq ln_mt_eqh;
/* recovery eq handler */
struct lnet_handle_eq ln_mt_eqh;
+ /*
+ * Completed when the discovery and monitor threads can enter their
+ * work loops
+ */
+ struct completion ln_started;
INIT_LIST_HEAD(&the_lnet.ln_mt_peerNIRecovq);
init_waitqueue_head(&the_lnet.ln_dc_waitq);
LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh);
INIT_LIST_HEAD(&the_lnet.ln_mt_peerNIRecovq);
init_waitqueue_head(&the_lnet.ln_dc_waitq);
LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh);
+ init_completion(&the_lnet.ln_started);
rc = lnet_descriptor_setup();
if (rc != 0)
rc = lnet_descriptor_setup();
if (rc != 0)
mutex_unlock(&the_lnet.ln_api_mutex);
mutex_unlock(&the_lnet.ln_api_mutex);
+ complete_all(&the_lnet.ln_started);
+
/* wait for all routers to start */
lnet_wait_router_start();
/* wait for all routers to start */
lnet_wait_router_start();
int interval;
time64_t now;
int interval;
time64_t now;
+ wait_for_completion(&the_lnet.ln_started);
/*
* The monitor thread takes care of the following:
* 1. Checks the aliveness of routers
/*
* The monitor thread takes care of the following:
* 1. Checks the aliveness of routers
struct lnet_peer *lp;
int rc;
struct lnet_peer *lp;
int rc;
+ wait_for_completion(&the_lnet.ln_started);
+
CDEBUG(D_NET, "started\n");
cfs_block_allsigs();
CDEBUG(D_NET, "started\n");
cfs_block_allsigs();
LASSERT(the_lnet.ln_dc_state == LNET_DC_STATE_RUNNING);
the_lnet.ln_dc_state = LNET_DC_STATE_STOPPING;
LASSERT(the_lnet.ln_dc_state == LNET_DC_STATE_RUNNING);
the_lnet.ln_dc_state = LNET_DC_STATE_STOPPING;
- wake_up(&the_lnet.ln_dc_waitq);
+
+ /* In the LNetNIInit() path we may be stopping discovery before it
+ * has entered its work loop
+ */
+ if (!completion_done(&the_lnet.ln_started))
+ complete(&the_lnet.ln_started);
+ else
+ wake_up(&the_lnet.ln_dc_waitq);
wait_event(the_lnet.ln_dc_waitq,
the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN);
wait_event(the_lnet.ln_dc_waitq,
the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN);