From: Liang Zhen Date: Tue, 10 Jul 2012 16:02:10 +0000 (+0800) Subject: LU-1600 lnet: another race in lnet_nid2peer_locked X-Git-Tag: 2.2.91~7 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=3ba222d5230cd89713aaf11cf0d8a8bc7ca62107 LU-1600 lnet: another race in lnet_nid2peer_locked We fixed a race for the case that LNet is shutdown while the second thread is in progress of creating a peer, but there is another race if LNet is shutdown _before_ lnet_nid2peer_locked is called by the second thread, for example: called from lnet_parse() by LND thread, in this case, we should return error w/o trying to create peer, otherwise the shuttingdown thread could think peer-table is empty and move to the next step with leaving a peer on deathrow list. This patch also passed error code into lnet_finalize() when it's called from lnet_drop_delayed_msg_list, otherwise lnet_finalize() will treat the message as a complete message and hit assertion because the message is actually incomplet. We also did some coding style cleanup in this patch. Signed-off-by: Liang Zhen Change-Id: I445298d639173840687412b11da41006ddc20c10 Reviewed-on: http://review.whamcloud.com/3369 Tested-by: Hudson Reviewed-by: Bobi Jam Reviewed-by: Doug Oucharek Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index e8dc8a0..f564796 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -1278,11 +1278,11 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid) LASSERT(src_nid != LNET_NID_ANY); lnet_msg_commit(msg, cpt); - if (!msg->msg_routing) - msg->msg_hdr.src_nid = cpu_to_le64(src_nid); + if (!msg->msg_routing) + msg->msg_hdr.src_nid = cpu_to_le64(src_nid); - if (src_ni == the_lnet.ln_loni) { - /* No send credit hassles with LOLND */ + if (src_ni == the_lnet.ln_loni) { + /* No send credit hassles with LOLND */ lnet_net_unlock(cpt); lnet_ni_send(src_ni, msg); @@ -1416,8 +1416,8 @@ lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg) lnet_build_msg_event(msg, LNET_EVENT_PUT); - /* Must I ACK? If so I'll grab the ack_wmd out of the header and put - * it back into the ACK during lnet_finalize() */ + /* Must I ACK? If so I'll grab the ack_wmd out of the header and put + * it back into the ACK during lnet_finalize() */ msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) && (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0); @@ -1531,7 +1531,7 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY); if (rc < 0) { /* didn't get as far as lnet_ni_send() */ - CERROR("%s: Unable to send REPLY for GET from %s: %d\n", + CERROR("%s: Unable to send REPLY for GET from %s: %d\n", libcfs_nid2str(ni->ni_nid), libcfs_id2str(info.mi_id), rc); @@ -2021,9 +2021,12 @@ lnet_drop_delayed_msg_list(cfs_list_t *head, char *reason) lnet_drop_message(msg->msg_rxpeer->lp_ni, msg->msg_rxpeer->lp_cpt, msg->msg_private, msg->msg_len); - /* NB: message will not generate event because w/o attached MD, - * so we just use 0 as the third parameter */ - lnet_finalize(msg->msg_rxpeer->lp_ni, msg, 0); + /* + * NB: message will not generate event because w/o attached MD, + * but we still should give error code so lnet_msg_decommit() + * can skip counters operations and other checks. + */ + lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT); } } diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 3c5ee47..325be18 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -186,8 +186,7 @@ lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid) cfs_list_t *peers; lnet_peer_t *lp; - if (the_lnet.ln_shutdown) - return NULL; + LASSERT(!the_lnet.ln_shutdown); peers = &ptable->pt_hash[lnet_nid2peerhash(nid)]; cfs_list_for_each_entry(lp, peers, lp_hashlist) { @@ -209,6 +208,10 @@ lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt) int cpt2; int rc = 0; + *lpp = NULL; + if (the_lnet.ln_shutdown) /* it's shutting down */ + return -ESHUTDOWN; + /* cpt can be LNET_LOCK_EX if it's called from router functions */ cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid); @@ -225,7 +228,6 @@ lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt) cfs_list_del(&lp->lp_hashlist); } - *lpp = NULL; /* * take extra refcount in case another thread has shutdown LNet * and destroyed locks and peer-table before I finish the allocation diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 7c2a453..a755db6 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -1572,6 +1572,11 @@ lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when) lnet_net_lock(cpt); + if (the_lnet.ln_shutdown) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid); if (lp == NULL) { /* nid not found */