Whamcloud - gitweb
b=10778,i=eeb:
author: isaac <isaac>
Fri, 31 Aug 2007 18:32:41 +0000 (18:32 +0000)
committer: isaac <isaac>
Fri, 31 Aug 2007 18:32:41 +0000 (18:32 +0000)
-   closed races between lnd_shutdown and peer creation that prevent
    lnd_shutdown from finishing.
-   fixed a couple of harmless off-by-one typos when skipping initial
    frags in io vectors.
-   added a missing check for memory allocation failure in lnet_new_rtrbuf.

12 files changed:
lnet/ChangeLog
lnet/klnds/iiblnd/iiblnd.c
lnet/klnds/iiblnd/iiblnd_cb.c
lnet/klnds/o2iblnd/o2iblnd_cb.c
lnet/klnds/openiblnd/openiblnd.c
lnet/klnds/openiblnd/openiblnd_cb.c
lnet/klnds/ptllnd/ptllnd_peer.c
lnet/klnds/socklnd/socklnd.c
lnet/klnds/viblnd/viblnd.c
lnet/klnds/viblnd/viblnd_cb.c
lnet/lnet/lib-move.c
lnet/lnet/router.c

index 9199dd3..2aae90d 100644 (file)
        ptllnd    - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x
 
 Severity   : normal
+Bugzilla   : 10778
+Description: kibnal_shutdown() doesn't finish; lconf --cleanup hangs
+Details    : races between lnd_shutdown and peer creation prevent 
+             lnd_shutdown from finishing.
+
+Severity   : normal
 Bugzilla   : 13279
 Description: open files rlimit 1024 reached while liblustre testing
 Details    : ulnds/socklnd must close open socket after unsuccessful
index 31bcfc6..6e1889f 100644 (file)
@@ -847,6 +847,10 @@ kibnal_add_persistent_peer (lnet_nid_t nid)
 
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
+        /* I'm always called with a reference on kibnal_data.kib_ni
+         * so shutdown can't have started */
+        LASSERT (kibnal_data.kib_listener_cep != NULL);
+
         peer2 = kibnal_find_peer_locked (nid);
         if (peer2 != NULL) {
                 kibnal_peer_decref (peer);
index 3e67548..727ad15 100644 (file)
@@ -2507,6 +2507,15 @@ kibnal_accept (kib_conn_t **connp, IB_HANDLE cep, kib_msg_t *msg, int nob)
         
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
+        if (kibnal_data.kib_listener_cep == NULL) { /* shutdown started */
+                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+                kibnal_peer_decref(peer);
+                kibnal_conn_decref(conn);
+                kibnal_reject(nid, cep, IBNAL_REJECT_NO_RESOURCES);
+                return -ESHUTDOWN;
+        }
+
         peer2 = kibnal_find_peer_locked(nid);
         if (peer2 == NULL) {
                 /* peer table takes my ref on peer */
index b9670d3..3f271df 100644 (file)
@@ -1458,6 +1458,9 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
         LASSERT (peer->ibp_connecting == 0);
         peer->ibp_connecting = 1;
 
+        /* always called with a ref on ni, which prevents ni being shutdown */
+        LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0);
+
         list_add_tail(&tx->tx_list, &peer->ibp_tx_queue);
 
         kiblnd_peer_addref(peer);
@@ -2302,6 +2305,9 @@ kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob)
                 LASSERT (peer->ibp_accepting == 0);
                 peer->ibp_accepting = 1;
 
+                /* I have a ref on ni that prevents it being shutdown */
+                LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0);
+
                 kiblnd_peer_addref(peer);
                 list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
 
index 3850d7e..8de9e85 100644 (file)
@@ -786,6 +786,10 @@ kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port)
 
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
+        /* I'm always called with a reference on kibnal_data.kib_ni
+         * so shutdown can't have started */
+        LASSERT (kibnal_data.kib_nonewpeers == 0);
+
         peer2 = kibnal_find_peer_locked (nid);
         if (peer2 != NULL) {
                 kibnal_peer_decref(peer);
index be869e9..9975a91 100644 (file)
@@ -1786,6 +1786,16 @@ kibnal_accept_connreq (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
         
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
+        if (kibnal_data.kib_nonewpeers) {
+                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+                
+                CERROR ("Shutdown has started, drop connreq from %s\n",
+                        libcfs_nid2str(msg->ibm_srcnid));
+                kibnal_conn_decref(conn);
+                kibnal_peer_decref(peer);
+                return -ESHUTDOWN;
+        }
+
         /* Check I'm the same instance that gave the connection parameters.  
          * NB If my incarnation changes after this, the peer will get nuked and
          * we'll spot that when the connection is finally added into the peer's
index d0f0bdb..7f0acf0 100644 (file)
@@ -90,6 +90,7 @@ kptllnd_get_peer_info(int index,
 void
 kptllnd_peer_add_peertable_locked (kptl_peer_t *peer)
 {
+        LASSERT (!kptllnd_data.kptl_shutdown);
         LASSERT (kptllnd_data.kptl_n_active_peers <
                  kptllnd_data.kptl_expected_peers);
 
@@ -1065,7 +1066,19 @@ kptllnd_peer_handle_hello (ptl_process_id_t  initiator,
         }
 
         write_lock_irqsave(g_lock, flags);
+
  again:
+        if (kptllnd_data.kptl_shutdown) {
+                write_unlock_irqrestore(g_lock, flags);
+
+                CERROR ("Shutdown started, refusing connection from %s\n",
+                        libcfs_id2str(lpid));
+                kptllnd_peer_unreserve_buffers();
+                kptllnd_peer_decref(new_peer);
+                kptllnd_tx_decref(hello_tx);
+                return NULL;
+        }
+
         peer = kptllnd_id2peer_locked(lpid);
         if (peer != NULL) {
                 if (peer->peer_state == PEER_STATE_WAITING_HELLO) {
@@ -1211,6 +1224,12 @@ kptllnd_find_target(kptl_peer_t **peerp, lnet_process_id_t target)
 
         write_lock_irqsave(g_lock, flags);
  again:
+        if (kptllnd_data.kptl_shutdown) {
+                write_unlock_irqrestore(g_lock, flags);
+                rc = -ESHUTDOWN;
+                goto unwind_2;
+        }
+
         *peerp = kptllnd_id2peer_locked(target);
         if (*peerp != NULL) {
                 write_unlock_irqrestore(g_lock, flags);
index 07bbf95..66410e9 100644 (file)
@@ -456,6 +456,9 @@ ksocknal_add_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port)
 
         write_lock_bh (&ksocknal_data.ksnd_global_lock);
 
+        /* always called with a ref on ni, so shutdown can't have started */
+        LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
         peer2 = ksocknal_find_peer_locked (ni, id);
         if (peer2 != NULL) {
                 ksocknal_peer_decref(peer);
@@ -1115,6 +1118,9 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
 
                 write_lock_bh (global_lock);
 
+                /* called with a ref on ni, so shutdown can't have started */
+                LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
                 peer2 = ksocknal_find_peer_locked(ni, peerid);
                 if (peer2 == NULL) {
                         /* NB this puts an "empty" peer in the peer
@@ -2306,6 +2312,60 @@ ksocknal_base_startup (void)
 }
 
 void
+ksocknal_debug_peerhash (lnet_ni_t *ni)
+{
+        ksock_peer_t     *peer = NULL;
+        struct list_head *tmp;
+        int               i;
+
+        read_lock (&ksocknal_data.ksnd_global_lock);
+
+        for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+                list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
+                        peer = list_entry (tmp, ksock_peer_t, ksnp_list);
+
+                        if (peer->ksnp_ni == ni) break;
+
+                        peer = NULL;
+                }
+        }
+
+        if (peer != NULL) {
+                ksock_route_t *route;
+                ksock_conn_t  *conn;
+
+                CWARN ("Active peer on shutdown: %s, ref %d, scnt %d, "
+                       "closing %d, accepting %d, err %d, zcookie "LPU64", "
+                       "txq %d, zc_req %d\n", libcfs_id2str(peer->ksnp_id),
+                       atomic_read(&peer->ksnp_refcount),
+                       peer->ksnp_sharecount, peer->ksnp_closing,
+                       peer->ksnp_accepting, peer->ksnp_error,
+                       peer->ksnp_zc_next_cookie,
+                       !list_empty(&peer->ksnp_tx_queue),
+                       !list_empty(&peer->ksnp_zc_req_list));
+
+                list_for_each (tmp, &peer->ksnp_routes) {
+                        route = list_entry(tmp, ksock_route_t, ksnr_list);
+                        CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, "
+                               "del %d\n", atomic_read(&route->ksnr_refcount),
+                               route->ksnr_scheduled, route->ksnr_connecting,
+                               route->ksnr_connected, route->ksnr_deleted);
+                }
+
+                list_for_each (tmp, &peer->ksnp_conns) {
+                        conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+                        CWARN ("Conn: ref %d, sref %d, t %d, c %d\n",
+                               atomic_read(&conn->ksnc_conn_refcount),
+                               atomic_read(&conn->ksnc_sock_refcount),
+                               conn->ksnc_type, conn->ksnc_closing);
+                }
+        }
+
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+        return;
+}
+
+void
 ksocknal_shutdown (lnet_ni_t *ni)
 {
         ksock_net_t      *net = ni->ni_data;
@@ -2335,6 +2395,8 @@ ksocknal_shutdown (lnet_ni_t *ni)
                        net->ksnn_npeers);
                 cfs_pause(cfs_time_seconds(1));
 
+                ksocknal_debug_peerhash(ni);
+
                 spin_lock_bh (&net->ksnn_lock);
         }
         spin_unlock_bh (&net->ksnn_lock);
index 10774a4..09d407a 100644 (file)
@@ -653,6 +653,10 @@ kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip)
 
         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
+        /* I'm always called with a reference on kibnal_data.kib_ni
+         * so shutdown can't have started */
+        LASSERT (kibnal_data.kib_listen_handle != NULL);
+
         peer2 = kibnal_find_peer_locked (nid);
         if (peer2 != NULL) {
                 kibnal_peer_decref (peer);
index a2d25f3..60572b8 100644 (file)
@@ -2188,7 +2188,7 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status)
                         break;
                 }
 
-                kibnal_peer_connect_failed(conn->ibc_peer, active, status);
+                kibnal_peer_connect_failed(peer, active, status);
                 kibnal_conn_disconnected(conn);
                 return;
         }
@@ -2210,8 +2210,7 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status)
          * peer instance... */
         kibnal_conn_addref(conn);               /* +1 ref for ibc_list */
         list_add(&conn->ibc_list, &peer->ibp_conns);
-        kibnal_close_stale_conns_locked (conn->ibc_peer,
-                                         conn->ibc_incarnation);
+        kibnal_close_stale_conns_locked (peer, conn->ibc_incarnation);
 
         if (!kibnal_peer_active(peer) ||        /* peer has been deleted */
             conn->ibc_comms_error != 0 ||       /* comms error */
@@ -2462,6 +2461,16 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
 
         write_lock_irqsave(g_lock, flags);
 
+        if (kibnal_data.kib_listen_handle == NULL) {
+                write_unlock_irqrestore(g_lock, flags);
+
+                CWARN ("Shutdown has started, rejecting connreq from %s\n",
+                       libcfs_nid2str(rxmsg.ibm_srcnid));
+                kibnal_peer_decref(peer);
+                reason = IBNAL_REJECT_FATAL;
+                goto reject;
+        }
+
         peer2 = kibnal_find_peer_locked(rxmsg.ibm_srcnid);
         if (peer2 != NULL) {
                 /* tie-break connection race in favour of the higher NID */                
index 750deb9..fdfe792 100644 (file)
@@ -485,7 +485,7 @@ lnet_copy_kiov2kiov (unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset
         LASSERT (!in_interrupt ());
 
         LASSERT (ndiov > 0);
-        while (doffset > diov->kiov_len) {
+        while (doffset >= diov->kiov_len) {
                 doffset -= diov->kiov_len;
                 diov++;
                 ndiov--;
@@ -493,7 +493,7 @@ lnet_copy_kiov2kiov (unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset
         }
 
         LASSERT (nsiov > 0);
-        while (soffset > siov->kiov_len) {
+        while (soffset >= siov->kiov_len) {
                 soffset -= siov->kiov_len;
                 siov++;
                 nsiov--;
@@ -565,7 +565,7 @@ lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset
         LASSERT (!in_interrupt ());
 
         LASSERT (niov > 0);
-        while (iovoffset > iov->iov_len) {
+        while (iovoffset >= iov->iov_len) {
                 iovoffset -= iov->iov_len;
                 iov++;
                 niov--;
@@ -573,7 +573,7 @@ lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset
         }
 
         LASSERT (nkiov > 0);
-        while (kiovoffset > kiov->kiov_len) {
+        while (kiovoffset >= kiov->kiov_len) {
                 kiovoffset -= kiov->kiov_len;
                 kiov++;
                 nkiov--;
@@ -634,7 +634,7 @@ lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffs
         LASSERT (!in_interrupt ());
 
         LASSERT (nkiov > 0);
-        while (kiovoffset > kiov->kiov_len) {
+        while (kiovoffset >= kiov->kiov_len) {
                 kiovoffset -= kiov->kiov_len;
                 kiov++;
                 nkiov--;
@@ -642,7 +642,7 @@ lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffs
         }
 
         LASSERT (niov > 0);
-        while (iovoffset > iov->iov_len) {
+        while (iovoffset >= iov->iov_len) {
                 iovoffset -= iov->iov_len;
                 iov++;
                 niov--;
index 721b641..df1bfc8 100644 (file)
@@ -855,6 +855,8 @@ lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp)
         int            i;
 
         LIBCFS_ALLOC(rb, sz);
+        if (rb == NULL)
+                return NULL;
 
         rb->rb_pool = rbp;