From 9283e2ed6655e89fe693d35313c9dcf1d5a6703a Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Sun, 14 Jul 2019 09:10:29 -0500 Subject: [PATCH] LU-12537 lnet: Sync the start of discovery and monitor threads The discovery thread starts up before the monitor thread so it may issue PUTs or GETs before the monitor thread has a chance to initialize its data structures (namely the_lnet.ln_mt_rstq). This can result in an Oops when we attempt to attach response trackers to MDs. Introduce a completion to synchronize the startup of these threads. Signed-off-by: Chris Horn Change-Id: I5d7356269090d8cbd1eab59fa29bee7ef211832f Reviewed-on: https://review.whamcloud.com/35478 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Alexandr Boyko Reviewed-by: Amir Shehata Reviewed-by: Oleg Drokin --- lnet/include/lnet/lib-types.h | 5 +++++ lnet/lnet/api-ni.c | 3 +++ lnet/lnet/lib-move.c | 1 + lnet/lnet/peer.c | 11 ++++++++++- 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 0550090..c3feaea 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -1143,6 +1143,11 @@ struct lnet { /* recovery eq handler */ struct lnet_handle_eq ln_mt_eqh; + /* + * Completed when the discovery and monitor threads can enter their + * work loops + */ + struct completion ln_started; }; #endif diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index be77024..d94f9f3 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -1134,6 +1134,7 @@ lnet_prepare(lnet_pid_t requested_pid) INIT_LIST_HEAD(&the_lnet.ln_mt_peerNIRecovq); init_waitqueue_head(&the_lnet.ln_dc_waitq); LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh); + init_completion(&the_lnet.ln_started); rc = lnet_descriptor_setup(); if (rc != 0) @@ -2666,6 +2667,8 @@ LNetNIInit(lnet_pid_t requested_pid) mutex_unlock(&the_lnet.ln_api_mutex); + complete_all(&the_lnet.ln_started); + /* wait for all routers to start */ lnet_wait_router_start(); diff --git 
a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 9b463e2..c1d4e84 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -3446,6 +3446,7 @@ lnet_monitor_thread(void *arg) int interval; time64_t now; + wait_for_completion(&the_lnet.ln_started); /* * The monitor thread takes care of the following: * 1. Checks the aliveness of routers diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 8e3f8f6..43c8d35 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -3276,6 +3276,8 @@ static int lnet_peer_discovery(void *arg) struct lnet_peer *lp; int rc; + wait_for_completion(&the_lnet.ln_started); + CDEBUG(D_NET, "started\n"); cfs_block_allsigs(); @@ -3448,7 +3450,14 @@ void lnet_peer_discovery_stop(void) LASSERT(the_lnet.ln_dc_state == LNET_DC_STATE_RUNNING); the_lnet.ln_dc_state = LNET_DC_STATE_STOPPING; - wake_up(&the_lnet.ln_dc_waitq); + + /* In the LNetNIInit() path we may be stopping discovery before it + * entered its work loop + */ + if (!completion_done(&the_lnet.ln_started)) + complete(&the_lnet.ln_started); + else + wake_up(&the_lnet.ln_dc_waitq); wait_event(the_lnet.ln_dc_waitq, the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN); -- 1.8.3.1