From: Olaf Weber Date: Fri, 27 Jan 2017 15:13:29 +0000 (+0100) Subject: LU-9119 lnet: fix race in lnet shutdown path X-Git-Tag: 2.9.57~5 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=1612925723908f4eb4bc2cefe677b7825027fe7f LU-9119 lnet: fix race in lnet shutdown path The locking changes for the lnet_net_lock made for Multi-Rail introduce a race in the LNet shutdown path. The code keeps two states in the_lnet.ln_shutdown: 0 means LNet is either up and running or shut down, while 1 means lnet is shutting down. In lnet_select_pathway() if we need to restart and drop and relock the lnet_net_lock we can find that LNet went from running to stopped, and not be able to tell the difference. Replace ln_shutdown with a three-state ln_state patterned on ln_rc_state: states are LNET_STATE_SHUTDOWN, LNET_STATE_RUNNING, and LNET_STATE_STOPPING. Most checks against ln_shutdown now test ln_state against LNET_STATE_RUNNING. LNet moves to RUNNING state in lnet_startup_lndnets(). Test-Parameters: trivial Signed-off-by: Olaf Weber Change-Id: I7afcbeb793dfa4d0a361e421ae06a99b7d4db903 Reviewed-on: https://review.whamcloud.com/26690 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Doug Oucharek Reviewed-by: Oleg Drokin --- diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index cfa4c39..c72771c 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -734,6 +734,11 @@ struct lnet_msg_container { #define LNET_RC_STATE_RUNNING 1 /* started up OK */ #define LNET_RC_STATE_STOPPING 2 /* telling thread to stop */ +/* LNet states */ +#define LNET_STATE_SHUTDOWN 0 /* not started */ +#define LNET_STATE_RUNNING 1 /* started up OK */ +#define LNET_STATE_STOPPING 2 /* telling thread to stop */ + typedef struct lnet { /* CPU partition table of LNet */ struct cfs_cpt_table *ln_cpt_table; @@ -812,8 +817,8 @@ typedef struct lnet { int ln_niinit_self; /* LNetNIInit/LNetNIFini counter */ int ln_refcount; - /* shutdown in progress */ - int ln_shutdown; + /* SHUTDOWN/RUNNING/STOPPING */ + int ln_state; int ln_routing; /* am I a router? */ lnet_pid_t ln_pid; /* requested pid */ diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 204fe49..e63feac 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -1354,11 +1354,11 @@ lnet_shutdown_lndnets(void) /* NB called holding the global mutex */ /* All quiet on the API front */ - LASSERT(!the_lnet.ln_shutdown); + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); LASSERT(the_lnet.ln_refcount == 0); lnet_net_lock(LNET_LOCK_EX); - the_lnet.ln_shutdown = 1; /* flag shutdown */ + the_lnet.ln_state = LNET_STATE_STOPPING; while (!list_empty(&the_lnet.ln_nets)) { /* @@ -1386,7 +1386,7 @@ lnet_shutdown_lndnets(void) } lnet_net_lock(LNET_LOCK_EX); - the_lnet.ln_shutdown = 0; + the_lnet.ln_state = LNET_STATE_SHUTDOWN; lnet_net_unlock(LNET_LOCK_EX); } @@ -1657,6 +1657,15 @@ lnet_startup_lndnets(struct list_head *netlist) int rc; int ni_count = 0; + /* + * Change to running state before bringing up the LNDs. This + * allows lnet_shutdown_lndnets() to assert that we've passed + * through here. + */ + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_state = LNET_STATE_RUNNING; + lnet_net_unlock(LNET_LOCK_EX); + while (!list_empty(netlist)) { net = list_entry(netlist->next, struct lnet_net, net_list); list_del_init(&net->net_list); diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 5f4045e..ec3051c 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -1423,7 +1423,7 @@ again: seq = lnet_get_dlc_seq_locked(); - if (the_lnet.ln_shutdown) { + if (the_lnet.ln_state != LNET_STATE_RUNNING) { lnet_net_unlock(cpt); return -ESHUTDOWN; } diff --git a/lnet/lnet/lib-ptl.c b/lnet/lnet/lib-ptl.c index ec5151e..3773ed9 100644 --- a/lnet/lnet/lib-ptl.c +++ b/lnet/lnet/lib-ptl.c @@ -584,7 +584,7 @@ lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg) mtable = lnet_mt_of_match(info, msg); lnet_res_lock(mtable->mt_cpt); - if (the_lnet.ln_shutdown) { + if (the_lnet.ln_state != LNET_STATE_RUNNING) { rc = LNET_MATCHMD_DROP; goto out1; } @@ -946,7 +946,7 @@ lnet_clear_lazy_portal(struct lnet_ni *ni, int portal, char *reason) list_move(&msg->msg_list, &zombies); } } else { - if (the_lnet.ln_shutdown) + if (the_lnet.ln_state != LNET_STATE_RUNNING) CWARN("Active lazy portal %d on exit\n", portal); else CDEBUG(D_NET, "clearing portal %d lazy\n", portal); diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 7b506a4..47e5ce2 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -427,7 +427,7 @@ lnet_peer_tables_cleanup(struct lnet_net *net) int i; struct lnet_peer_table *ptable; - LASSERT(the_lnet.ln_shutdown || net != NULL); + LASSERT(the_lnet.ln_state != LNET_STATE_SHUTDOWN || net != NULL); /* If just deleting the peers for a NI, get rid of any routes these * peers are gateways for. */ cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { @@ -453,7 +453,7 @@ lnet_get_peer_ni_locked(struct lnet_peer_table *ptable, lnet_nid_t nid) struct list_head *peers; struct lnet_peer_ni *lp; - LASSERT(!the_lnet.ln_shutdown); + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); peers = &ptable->pt_hash[lnet_nid2peerhash(nid)]; list_for_each_entry(lp, peers, lpni_hashlist) { @@ -994,7 +994,7 @@ lnet_nid2peerni_ex(lnet_nid_t nid, int cpt) struct lnet_peer_ni *lpni = NULL; int rc; - if (the_lnet.ln_shutdown) /* it's shutting down */ + if (the_lnet.ln_state != LNET_STATE_RUNNING) return ERR_PTR(-ESHUTDOWN); /* @@ -1028,7 +1028,7 @@ lnet_nid2peerni_locked(lnet_nid_t nid, int cpt) struct lnet_peer_ni *lpni = NULL; int rc; - if (the_lnet.ln_shutdown) /* it's shutting down */ + if (the_lnet.ln_state != LNET_STATE_RUNNING) return ERR_PTR(-ESHUTDOWN); /* @@ -1057,7 +1057,7 @@ lnet_nid2peerni_locked(lnet_nid_t nid, int cpt) * Shutdown is only set under the ln_api_lock, so a single * check here is sufficent. */ - if (the_lnet.ln_shutdown) { + if (the_lnet.ln_state != LNET_STATE_RUNNING) { lpni = ERR_PTR(-ESHUTDOWN); goto out_mutex_unlock; } diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 0e929bf..aba5699 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -249,7 +249,7 @@ lnet_find_rnet_locked(__u32 net) struct list_head *tmp; struct list_head *rn_list; - LASSERT(!the_lnet.ln_shutdown); + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); rn_list = lnet_net2rnethash(net); list_for_each(tmp, rn_list) { @@ -381,7 +381,7 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway, return rc; } route->lr_gateway = lpni; - LASSERT(!the_lnet.ln_shutdown); + LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); rnet2 = lnet_find_rnet_locked(net); if (rnet2 == NULL) { @@ -1804,7 +1804,7 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, cfs_time_t when) lnet_net_lock(cpt); - if (the_lnet.ln_shutdown) { + if (the_lnet.ln_state != LNET_STATE_RUNNING) { lnet_net_unlock(cpt); return -ESHUTDOWN; }