Whamcloud - gitweb
LU-1600 lnet: another race in lnet_nid2peer_locked
authorLiang Zhen <liang@whamcloud.com>
Tue, 10 Jul 2012 16:02:10 +0000 (00:02 +0800)
committerOleg Drokin <green@whamcloud.com>
Thu, 19 Jul 2012 04:10:25 +0000 (00:10 -0400)
We fixed a race for the case that LNet is shutdown while the second
thread is in progress of creating a peer, but there is another race
if LNet is shutdown _before_ lnet_nid2peer_locked is called by the
second thread, for example: called from lnet_parse() by LND thread,
in this case, we should return error w/o trying to create peer,
otherwise the shuttingdown thread could think peer-table is empty
and move to the next step with leaving a peer on deathrow list.

This patch also passed error code into lnet_finalize() when it's
called from lnet_drop_delayed_msg_list, otherwise lnet_finalize()
will treat the message as a complete message and hit assertion
because the message is actually incomplet.

We also did some coding style cleanup in this patch.

Signed-off-by: Liang Zhen <liang@whamcloud.com>
Change-Id: I445298d639173840687412b11da41006ddc20c10
Reviewed-on: http://review.whamcloud.com/3369
Tested-by: Hudson
Reviewed-by: Bobi Jam <bobijam@whamcloud.com>
Reviewed-by: Doug Oucharek <doug@whamcloud.com>
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/lnet/lib-move.c
lnet/lnet/peer.c
lnet/lnet/router.c

index e8dc8a0..f564796 100644 (file)
@@ -1278,11 +1278,11 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid)
                LASSERT(src_nid != LNET_NID_ANY);
                lnet_msg_commit(msg, cpt);
 
                LASSERT(src_nid != LNET_NID_ANY);
                lnet_msg_commit(msg, cpt);
 
-                if (!msg->msg_routing)
-                        msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+               if (!msg->msg_routing)
+                       msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
 
 
-                if (src_ni == the_lnet.ln_loni) {
-                        /* No send credit hassles with LOLND */
+               if (src_ni == the_lnet.ln_loni) {
+                       /* No send credit hassles with LOLND */
                        lnet_net_unlock(cpt);
                        lnet_ni_send(src_ni, msg);
 
                        lnet_net_unlock(cpt);
                        lnet_ni_send(src_ni, msg);
 
@@ -1416,8 +1416,8 @@ lnet_recv_put(lnet_ni_t *ni, lnet_msg_t *msg)
 
        lnet_build_msg_event(msg, LNET_EVENT_PUT);
 
 
        lnet_build_msg_event(msg, LNET_EVENT_PUT);
 
-        /* Must I ACK?  If so I'll grab the ack_wmd out of the header and put
-         * it back into the ACK during lnet_finalize() */
+       /* Must I ACK?  If so I'll grab the ack_wmd out of the header and put
+        * it back into the ACK during lnet_finalize() */
        msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
                        (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0);
 
        msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
                        (msg->msg_md->md_options & LNET_MD_ACK_DISABLE) == 0);
 
@@ -1531,7 +1531,7 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
        rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY);
        if (rc < 0) {
                /* didn't get as far as lnet_ni_send() */
        rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY);
        if (rc < 0) {
                /* didn't get as far as lnet_ni_send() */
-                CERROR("%s: Unable to send REPLY for GET from %s: %d\n",
+               CERROR("%s: Unable to send REPLY for GET from %s: %d\n",
                       libcfs_nid2str(ni->ni_nid),
                       libcfs_id2str(info.mi_id), rc);
 
                       libcfs_nid2str(ni->ni_nid),
                       libcfs_id2str(info.mi_id), rc);
 
@@ -2021,9 +2021,12 @@ lnet_drop_delayed_msg_list(cfs_list_t *head, char *reason)
                lnet_drop_message(msg->msg_rxpeer->lp_ni,
                                  msg->msg_rxpeer->lp_cpt,
                                  msg->msg_private, msg->msg_len);
                lnet_drop_message(msg->msg_rxpeer->lp_ni,
                                  msg->msg_rxpeer->lp_cpt,
                                  msg->msg_private, msg->msg_len);
-               /* NB: message will not generate event because w/o attached MD,
-                * so we just use 0 as the third parameter */
-               lnet_finalize(msg->msg_rxpeer->lp_ni, msg, 0);
+               /*
+                * NB: message will not generate event because w/o attached MD,
+                * but we still should give error code so lnet_msg_decommit()
+                * can skip counters operations and other checks.
+                */
+               lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT);
        }
 }
 
        }
 }
 
index 3c5ee47..325be18 100644 (file)
@@ -186,8 +186,7 @@ lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid)
        cfs_list_t      *peers;
        lnet_peer_t     *lp;
 
        cfs_list_t      *peers;
        lnet_peer_t     *lp;
 
-       if (the_lnet.ln_shutdown)
-               return NULL;
+       LASSERT(!the_lnet.ln_shutdown);
 
        peers = &ptable->pt_hash[lnet_nid2peerhash(nid)];
        cfs_list_for_each_entry(lp, peers, lp_hashlist) {
 
        peers = &ptable->pt_hash[lnet_nid2peerhash(nid)];
        cfs_list_for_each_entry(lp, peers, lp_hashlist) {
@@ -209,6 +208,10 @@ lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt)
        int                     cpt2;
        int                     rc = 0;
 
        int                     cpt2;
        int                     rc = 0;
 
+       *lpp = NULL;
+       if (the_lnet.ln_shutdown) /* it's shutting down */
+               return -ESHUTDOWN;
+
        /* cpt can be LNET_LOCK_EX if it's called from router functions */
        cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid);
 
        /* cpt can be LNET_LOCK_EX if it's called from router functions */
        cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid);
 
@@ -225,7 +228,6 @@ lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt)
                cfs_list_del(&lp->lp_hashlist);
        }
 
                cfs_list_del(&lp->lp_hashlist);
        }
 
-       *lpp = NULL;
        /*
         * take extra refcount in case another thread has shutdown LNet
         * and destroyed locks and peer-table before I finish the allocation
        /*
         * take extra refcount in case another thread has shutdown LNet
         * and destroyed locks and peer-table before I finish the allocation
index 7c2a453..a755db6 100644 (file)
@@ -1572,6 +1572,11 @@ lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
 
        lnet_net_lock(cpt);
 
 
        lnet_net_lock(cpt);
 
+       if (the_lnet.ln_shutdown) {
+               lnet_net_unlock(cpt);
+               return -ESHUTDOWN;
+       }
+
        lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid);
        if (lp == NULL) {
                /* nid not found */
        lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid);
        if (lp == NULL) {
                /* nid not found */