From: isaac Date: Fri, 31 Aug 2007 18:32:41 +0000 (+0000) Subject: b=10778,i=eeb: X-Git-Tag: v1_7_0_51~813 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=682bb58318d7d5d101febfc6770c95887c750e2e b=10778,i=eeb: - closed races between lnd_shutdown and peer creation that prevent lnd_shutdown from finishing. - fixed a couple of harmless off-by-one typos when skipping initial frags in io vectors. - added a missing check for memory allocation failure in lnet_new_rtrbuf. --- diff --git a/lnet/ChangeLog b/lnet/ChangeLog index 9199dd3..2aae90d 100644 --- a/lnet/ChangeLog +++ b/lnet/ChangeLog @@ -13,6 +13,12 @@ ptllnd - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x Severity : normal +Bugzilla : 10778 +Description: kibnal_shutdown() doesn't finish; lconf --cleanup hangs +Details : races between lnd_shutdown and peer creation prevent + lnd_shutdown from finishing. + +Severity : normal Bugzilla : 13279 Description: open files rlimit 1024 reached while liblustre testing Details : ulnds/socklnd must close open socket after unsuccessful diff --git a/lnet/klnds/iiblnd/iiblnd.c b/lnet/klnds/iiblnd/iiblnd.c index 31bcfc6..6e1889f 100644 --- a/lnet/klnds/iiblnd/iiblnd.c +++ b/lnet/klnds/iiblnd/iiblnd.c @@ -847,6 +847,10 @@ kibnal_add_persistent_peer (lnet_nid_t nid) write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + /* I'm always called with a reference on kibnal_data.kib_ni + * so shutdown can't have started */ + LASSERT (kibnal_data.kib_listener_cep != NULL); + peer2 = kibnal_find_peer_locked (nid); if (peer2 != NULL) { kibnal_peer_decref (peer); diff --git a/lnet/klnds/iiblnd/iiblnd_cb.c b/lnet/klnds/iiblnd/iiblnd_cb.c index 3e67548..727ad15 100644 --- a/lnet/klnds/iiblnd/iiblnd_cb.c +++ b/lnet/klnds/iiblnd/iiblnd_cb.c @@ -2507,6 +2507,15 @@ kibnal_accept (kib_conn_t **connp, IB_HANDLE cep, kib_msg_t *msg, int nob) write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + if (kibnal_data.kib_listener_cep == NULL) { /* shutdown started */ + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + kibnal_peer_decref(peer); + kibnal_conn_decref(conn); + kibnal_reject(nid, cep, IBNAL_REJECT_NO_RESOURCES); + return -ESHUTDOWN; + } + peer2 = kibnal_find_peer_locked(nid); if (peer2 == NULL) { /* peer table takes my ref on peer */ diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index b9670d32..3f271df 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -1458,6 +1458,9 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid) LASSERT (peer->ibp_connecting == 0); peer->ibp_connecting = 1; + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0); + list_add_tail(&tx->tx_list, &peer->ibp_tx_queue); kiblnd_peer_addref(peer); @@ -2302,6 +2305,9 @@ kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob) LASSERT (peer->ibp_accepting == 0); peer->ibp_accepting = 1; + /* I have a ref on ni that prevents it being shutdown */ + LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0); + kiblnd_peer_addref(peer); list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid)); diff --git a/lnet/klnds/openiblnd/openiblnd.c b/lnet/klnds/openiblnd/openiblnd.c index 3850d7e..8de9e85 100644 --- a/lnet/klnds/openiblnd/openiblnd.c +++ b/lnet/klnds/openiblnd/openiblnd.c @@ -786,6 +786,10 @@ kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port) write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + /* I'm always called with a reference on kibnal_data.kib_ni + * so shutdown can't have started */ + LASSERT (kibnal_data.kib_nonewpeers == 0); + peer2 = kibnal_find_peer_locked (nid); if (peer2 != NULL) { kibnal_peer_decref(peer); diff --git a/lnet/klnds/openiblnd/openiblnd_cb.c b/lnet/klnds/openiblnd/openiblnd_cb.c index be869e9..9975a91 100644 --- a/lnet/klnds/openiblnd/openiblnd_cb.c +++ b/lnet/klnds/openiblnd/openiblnd_cb.c @@ -1786,6 +1786,16 @@ kibnal_accept_connreq (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid, write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + if (kibnal_data.kib_nonewpeers) { + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + CERROR ("Shutdown has started, drop connreq from %s\n", + libcfs_nid2str(msg->ibm_srcnid)); + kibnal_conn_decref(conn); + kibnal_peer_decref(peer); + return -ESHUTDOWN; + } + /* Check I'm the same instance that gave the connection parameters. * NB If my incarnation changes after this, the peer will get nuked and * we'll spot that when the connection is finally added into the peer's diff --git a/lnet/klnds/ptllnd/ptllnd_peer.c b/lnet/klnds/ptllnd/ptllnd_peer.c index d0f0bdb..7f0acf0 100644 --- a/lnet/klnds/ptllnd/ptllnd_peer.c +++ b/lnet/klnds/ptllnd/ptllnd_peer.c @@ -90,6 +90,7 @@ kptllnd_get_peer_info(int index, void kptllnd_peer_add_peertable_locked (kptl_peer_t *peer) { + LASSERT (!kptllnd_data.kptl_shutdown); LASSERT (kptllnd_data.kptl_n_active_peers < kptllnd_data.kptl_expected_peers); @@ -1065,7 +1066,19 @@ kptllnd_peer_handle_hello (ptl_process_id_t initiator, } write_lock_irqsave(g_lock, flags); + again: + if (kptllnd_data.kptl_shutdown) { + write_unlock_irqrestore(g_lock, flags); + + CERROR ("Shutdown started, refusing connection from %s\n", + libcfs_id2str(lpid)); + kptllnd_peer_unreserve_buffers(); + kptllnd_peer_decref(new_peer); + kptllnd_tx_decref(hello_tx); + return NULL; + } + peer = kptllnd_id2peer_locked(lpid); if (peer != NULL) { if (peer->peer_state == PEER_STATE_WAITING_HELLO) { @@ -1211,6 +1224,12 @@ kptllnd_find_target(kptl_peer_t **peerp, lnet_process_id_t target) write_lock_irqsave(g_lock, flags); again: + if (kptllnd_data.kptl_shutdown) { + write_unlock_irqrestore(g_lock, flags); + rc = -ESHUTDOWN; + goto unwind_2; + } + *peerp = kptllnd_id2peer_locked(target); if (*peerp != NULL) { write_unlock_irqrestore(g_lock, flags); diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 07bbf95..66410e9 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -456,6 +456,9 @@ ksocknal_add_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port) write_lock_bh (&ksocknal_data.ksnd_global_lock); + /* always called with a ref on ni, so shutdown can't have started */ + LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + peer2 = ksocknal_find_peer_locked (ni, id); if (peer2 != NULL) { ksocknal_peer_decref(peer); @@ -1115,6 +1118,9 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, write_lock_bh (global_lock); + /* called with a ref on ni, so shutdown can't have started */ + LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + peer2 = ksocknal_find_peer_locked(ni, peerid); if (peer2 == NULL) { /* NB this puts an "empty" peer in the peer @@ -2306,6 +2312,60 @@ ksocknal_base_startup (void) } void +ksocknal_debug_peerhash (lnet_ni_t *ni) +{ + ksock_peer_t *peer = NULL; + struct list_head *tmp; + int i; + + read_lock (&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry (tmp, ksock_peer_t, ksnp_list); + + if (peer->ksnp_ni == ni) break; + + peer = NULL; + } + } + + if (peer != NULL) { + ksock_route_t *route; + ksock_conn_t *conn; + + CWARN ("Active peer on shutdown: %s, ref %d, scnt %d, " + "closing %d, accepting %d, err %d, zcookie "LPU64", " + "txq %d, zc_req %d\n", libcfs_id2str(peer->ksnp_id), + atomic_read(&peer->ksnp_refcount), + peer->ksnp_sharecount, peer->ksnp_closing, + peer->ksnp_accepting, peer->ksnp_error, + peer->ksnp_zc_next_cookie, + !list_empty(&peer->ksnp_tx_queue), + !list_empty(&peer->ksnp_zc_req_list)); + + list_for_each (tmp, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, " + "del %d\n", atomic_read(&route->ksnr_refcount), + route->ksnr_scheduled, route->ksnr_connecting, + route->ksnr_connected, route->ksnr_deleted); + } + + list_for_each (tmp, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + CWARN ("Conn: ref %d, sref %d, t %d, c %d\n", + atomic_read(&conn->ksnc_conn_refcount), + atomic_read(&conn->ksnc_sock_refcount), + conn->ksnc_type, conn->ksnc_closing); + } + } + + read_unlock (&ksocknal_data.ksnd_global_lock); + return; +} + +void ksocknal_shutdown (lnet_ni_t *ni) { ksock_net_t *net = ni->ni_data; @@ -2335,6 +2395,8 @@ ksocknal_shutdown (lnet_ni_t *ni) net->ksnn_npeers); cfs_pause(cfs_time_seconds(1)); + ksocknal_debug_peerhash(ni); + spin_lock_bh (&net->ksnn_lock); } spin_unlock_bh (&net->ksnn_lock); diff --git a/lnet/klnds/viblnd/viblnd.c b/lnet/klnds/viblnd/viblnd.c index 10774a4..09d407a 100644 --- a/lnet/klnds/viblnd/viblnd.c +++ b/lnet/klnds/viblnd/viblnd.c @@ -653,6 +653,10 @@ kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip) write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + /* I'm always called with a reference on kibnal_data.kib_ni + * so shutdown can't have started */ + LASSERT (kibnal_data.kib_listen_handle != NULL); + peer2 = kibnal_find_peer_locked (nid); if (peer2 != NULL) { kibnal_peer_decref (peer); diff --git a/lnet/klnds/viblnd/viblnd_cb.c b/lnet/klnds/viblnd/viblnd_cb.c index a2d25f3..60572b8 100644 --- a/lnet/klnds/viblnd/viblnd_cb.c +++ b/lnet/klnds/viblnd/viblnd_cb.c @@ -2188,7 +2188,7 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status) break; } - kibnal_peer_connect_failed(conn->ibc_peer, active, status); + kibnal_peer_connect_failed(peer, active, status); kibnal_conn_disconnected(conn); return; } @@ -2210,8 +2210,7 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status) * peer instance... */ kibnal_conn_addref(conn); /* +1 ref for ibc_list */ list_add(&conn->ibc_list, &peer->ibp_conns); - kibnal_close_stale_conns_locked (conn->ibc_peer, - conn->ibc_incarnation); + kibnal_close_stale_conns_locked (peer, conn->ibc_incarnation); if (!kibnal_peer_active(peer) || /* peer has been deleted */ conn->ibc_comms_error != 0 || /* comms error */ @@ -2462,6 +2461,16 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) write_lock_irqsave(g_lock, flags); + if (kibnal_data.kib_listen_handle == NULL) { + write_unlock_irqrestore(g_lock, flags); + + CWARN ("Shutdown has started, rejecting connreq from %s\n", + libcfs_nid2str(rxmsg.ibm_srcnid)); + kibnal_peer_decref(peer); + reason = IBNAL_REJECT_FATAL; + goto reject; + } + peer2 = kibnal_find_peer_locked(rxmsg.ibm_srcnid); if (peer2 != NULL) { /* tie-break connection race in favour of the higher NID */ diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 750deb9..fdfe792 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -485,7 +485,7 @@ lnet_copy_kiov2kiov (unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset LASSERT (!in_interrupt ()); LASSERT (ndiov > 0); - while (doffset > diov->kiov_len) { + while (doffset >= diov->kiov_len) { doffset -= diov->kiov_len; diov++; ndiov--; @@ -493,7 +493,7 @@ lnet_copy_kiov2kiov (unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset } LASSERT (nsiov > 0); - while (soffset > siov->kiov_len) { + while (soffset >= siov->kiov_len) { soffset -= siov->kiov_len; siov++; nsiov--; @@ -565,7 +565,7 @@ lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset LASSERT (!in_interrupt ()); LASSERT (niov > 0); - while (iovoffset > iov->iov_len) { + while (iovoffset >= iov->iov_len) { iovoffset -= iov->iov_len; iov++; niov--; @@ -573,7 +573,7 @@ lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset } LASSERT (nkiov > 0); - while (kiovoffset > kiov->kiov_len) { + while (kiovoffset >= kiov->kiov_len) { kiovoffset -= kiov->kiov_len; kiov++; nkiov--; @@ -634,7 +634,7 @@ lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffs LASSERT (!in_interrupt ()); LASSERT (nkiov > 0); - while (kiovoffset > kiov->kiov_len) { + while (kiovoffset >= kiov->kiov_len) { kiovoffset -= kiov->kiov_len; kiov++; nkiov--; @@ -642,7 +642,7 @@ lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffs } LASSERT (niov > 0); - while (iovoffset > iov->iov_len) { + while (iovoffset >= iov->iov_len) { iovoffset -= iov->iov_len; iov++; niov--; diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 721b641..df1bfc8 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -855,6 +855,8 @@ lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp) int i; LIBCFS_ALLOC(rb, sz); + if (rb == NULL) + return NULL; rb->rb_pool = rbp;