Whamcloud - gitweb
LU-1600 lnet: peer creation has race with shutdown
authorLiang Zhen <liang@whamcloud.com>
Thu, 5 Jul 2012 05:10:08 +0000 (13:10 +0800)
committerOleg Drokin <green@whamcloud.com>
Fri, 6 Jul 2012 16:29:21 +0000 (12:29 -0400)
lnet_nid2peer_locked()->lnet_find_peer_locked() will get NULL if
LNet's in progress of shutting down, then it will try to allocate
a new peer and insert it into peer table. If one thread is doing this
and another thread could have already finalized everything of LNet,
so the first thread will crash system after allocation.

The solution is add an extra refcount on peer-table (number of peers)
before allocating new peer, because the shutting down thread always
needs to wait until peers number to be zero before going to the
next step of finalization.

This bug is not introduced by new LNet, but it can be exposed
easily by new LNet.

Signed-off-by: Liang Zhen <liang@whamcloud.com>
Change-Id: I5c8d26f08ce56092bee1b4bae5111fdfe1e9c12b
Reviewed-on: http://review.whamcloud.com/3274
Reviewed-by: Bobi Jam <bobijam@whamcloud.com>
Tested-by: Hudson
Reviewed-by: Doug Oucharek <doug@whamcloud.com>
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/lnet/peer.c

index bca85f2..3c5ee47 100644 (file)
@@ -207,16 +207,17 @@ lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt)
        lnet_peer_t             *lp = NULL;
        lnet_peer_t             *lp2;
        int                     cpt2;
+       int                     rc = 0;
 
        /* cpt can be LNET_LOCK_EX if it's called from router functions */
        cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid);
 
        ptable = the_lnet.ln_peer_tables[cpt2];
        lp = lnet_find_peer_locked(ptable, nid);
-        if (lp != NULL) {
-                *lpp = lp;
-                return 0;
-        }
+       if (lp != NULL) {
+               *lpp = lp;
+               return 0;
+       }
 
        if (!cfs_list_empty(&ptable->pt_deathrow)) {
                lp = cfs_list_entry(ptable->pt_deathrow.next,
@@ -224,6 +225,12 @@ lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt)
                cfs_list_del(&lp->lp_hashlist);
        }
 
+       *lpp = NULL;
+       /*
+        * take extra refcount in case another thread has shutdown LNet
+        * and destroyed locks and peer-table before I finish the allocation
+        */
+       ptable->pt_number++;
        lnet_net_unlock(cpt);
 
        if (lp != NULL)
@@ -232,10 +239,10 @@ lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt)
                LIBCFS_CPT_ALLOC(lp, lnet_cpt_table(), cpt2, sizeof(*lp));
 
        if (lp == NULL) {
-                *lpp = NULL;
-                LNET_LOCK();
-                return -ENOMEM;
-        }
+               rc = -ENOMEM;
+               lnet_net_lock(cpt);
+               goto out;
+       }
 
        CFS_INIT_LIST_HEAD(&lp->lp_txq);
        CFS_INIT_LIST_HEAD(&lp->lp_rtrq);
@@ -253,48 +260,44 @@ lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid, int cpt)
        lp->lp_ping_feats = LNET_PING_FEAT_INVAL;
        lp->lp_nid = nid;
        lp->lp_cpt = cpt2;
-       lp->lp_refcount = 2;                    /* 1 for caller; 1 for hash */
+       lp->lp_refcount = 2;    /* 1 for caller; 1 for hash */
        lp->lp_rtr_refcount = 0;
 
        lnet_net_lock(cpt);
 
+       if (the_lnet.ln_shutdown) {
+               rc = -ESHUTDOWN;
+               goto out;
+       }
+
        lp2 = lnet_find_peer_locked(ptable, nid);
        if (lp2 != NULL) {
-               cfs_list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
-
-                if (the_lnet.ln_shutdown) {
-                        lnet_peer_decref_locked(lp2);
-                        *lpp = NULL;
-                        return -ESHUTDOWN;
-                }
-
-                *lpp = lp2;
-                return 0;
-        }
+               *lpp = lp2;
+               goto out;
+       }
 
        lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid), cpt2);
        if (lp->lp_ni == NULL) {
-               cfs_list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
-
-                *lpp = NULL;
-                return the_lnet.ln_shutdown ? -ESHUTDOWN : -EHOSTUNREACH;
-        }
-
-        lp->lp_txcredits    =
-        lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits;
-        lp->lp_rtrcredits    =
-        lp->lp_minrtrcredits = lnet_peer_buffer_credits(lp->lp_ni);
+               rc = -EHOSTUNREACH;
+               goto out;
+       }
 
-        /* can't add peers after shutdown starts */
-        LASSERT (!the_lnet.ln_shutdown);
+       lp->lp_txcredits    =
+       lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits;
+       lp->lp_rtrcredits    =
+       lp->lp_minrtrcredits = lnet_peer_buffer_credits(lp->lp_ni);
 
        cfs_list_add_tail(&lp->lp_hashlist,
                          &ptable->pt_hash[lnet_nid2peerhash(nid)]);
        ptable->pt_version++;
-       ptable->pt_number++;
-
        *lpp = lp;
+
        return 0;
+out:
+       if (lp != NULL)
+               cfs_list_add(&lp->lp_hashlist, &ptable->pt_deathrow);
+       ptable->pt_number--;
+       return rc;
 }
 
 void