ptllnd - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x
Severity : normal
+Bugzilla : 10778
+Description: kibnal_shutdown() doesn't finish; lconf --cleanup hangs
+Details : races between lnd_shutdown and peer creation prevent
+ lnd_shutdown from finishing.
+
+Severity : normal
Bugzilla : 13279
Description: open files rlimit 1024 reached while liblustre testing
Details : ulnds/socklnd must close open socket after unsuccessful
write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+ /* I'm always called with a reference on kibnal_data.kib_ni
+ * so shutdown can't have started */
+ LASSERT (kibnal_data.kib_listener_cep != NULL);
+
peer2 = kibnal_find_peer_locked (nid);
if (peer2 != NULL) {
kibnal_peer_decref (peer);
write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+ if (kibnal_data.kib_listener_cep == NULL) { /* shutdown started */
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+ kibnal_peer_decref(peer);
+ kibnal_conn_decref(conn);
+ kibnal_reject(nid, cep, IBNAL_REJECT_NO_RESOURCES);
+ return -ESHUTDOWN;
+ }
+
peer2 = kibnal_find_peer_locked(nid);
if (peer2 == NULL) {
/* peer table takes my ref on peer */
LASSERT (peer->ibp_connecting == 0);
peer->ibp_connecting = 1;
+ /* always called with a ref on ni, which prevents ni being shutdown */
+ LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0);
+
list_add_tail(&tx->tx_list, &peer->ibp_tx_queue);
kiblnd_peer_addref(peer);
LASSERT (peer->ibp_accepting == 0);
peer->ibp_accepting = 1;
+ /* I have a ref on ni that prevents it being shutdown */
+ LASSERT (((kib_net_t *)ni->ni_data)->ibn_shutdown == 0);
+
kiblnd_peer_addref(peer);
list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+ /* I'm always called with a reference on kibnal_data.kib_ni
+ * so shutdown can't have started */
+ LASSERT (kibnal_data.kib_nonewpeers == 0);
+
peer2 = kibnal_find_peer_locked (nid);
if (peer2 != NULL) {
kibnal_peer_decref(peer);
write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+ if (kibnal_data.kib_nonewpeers) {
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ CERROR ("Shutdown has started, drop connreq from %s\n",
+ libcfs_nid2str(msg->ibm_srcnid));
+ kibnal_conn_decref(conn);
+ kibnal_peer_decref(peer);
+ return -ESHUTDOWN;
+ }
+
/* Check I'm the same instance that gave the connection parameters.
* NB If my incarnation changes after this, the peer will get nuked and
* we'll spot that when the connection is finally added into the peer's
void
kptllnd_peer_add_peertable_locked (kptl_peer_t *peer)
{
+ LASSERT (!kptllnd_data.kptl_shutdown);
LASSERT (kptllnd_data.kptl_n_active_peers <
kptllnd_data.kptl_expected_peers);
}
write_lock_irqsave(g_lock, flags);
+
again:
+ if (kptllnd_data.kptl_shutdown) {
+ write_unlock_irqrestore(g_lock, flags);
+
+ CERROR ("Shutdown started, refusing connection from %s\n",
+ libcfs_id2str(lpid));
+ kptllnd_peer_unreserve_buffers();
+ kptllnd_peer_decref(new_peer);
+ kptllnd_tx_decref(hello_tx);
+ return NULL;
+ }
+
peer = kptllnd_id2peer_locked(lpid);
if (peer != NULL) {
if (peer->peer_state == PEER_STATE_WAITING_HELLO) {
write_lock_irqsave(g_lock, flags);
again:
+ if (kptllnd_data.kptl_shutdown) {
+ write_unlock_irqrestore(g_lock, flags);
+ rc = -ESHUTDOWN;
+ goto unwind_2;
+ }
+
*peerp = kptllnd_id2peer_locked(target);
if (*peerp != NULL) {
write_unlock_irqrestore(g_lock, flags);
write_lock_bh (&ksocknal_data.ksnd_global_lock);
+ /* always called with a ref on ni, so shutdown can't have started */
+ LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
peer2 = ksocknal_find_peer_locked (ni, id);
if (peer2 != NULL) {
ksocknal_peer_decref(peer);
write_lock_bh (global_lock);
+ /* called with a ref on ni, so shutdown can't have started */
+ LASSERT (((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0);
+
peer2 = ksocknal_find_peer_locked(ni, peerid);
if (peer2 == NULL) {
/* NB this puts an "empty" peer in the peer
}
void
+ksocknal_debug_peerhash (lnet_ni_t *ni)
+{
+        /*
+         * Diagnostic helper: look through the socklnd peer hash table for a
+         * peer whose ksnp_ni is 'ni' and, if one exists, log its state plus
+         * the state of each of its routes and connections via CWARN.
+         *
+         * Only the first matching peer is reported.  The whole walk and all
+         * logging happen under the read side of ksnd_global_lock.
+         */
+        ksock_peer_t     *peer = NULL;
+        struct list_head *tmp;
+        int               i;
+
+        read_lock (&ksocknal_data.ksnd_global_lock);
+
+        for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
+                list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
+                        peer = list_entry (tmp, ksock_peer_t, ksnp_list);
+
+                        if (peer->ksnp_ni == ni)
+                                break;
+
+                        /* not on this ni: don't leave a stale candidate */
+                        peer = NULL;
+                }
+
+                /* 'peer' is non-NULL here only if the bucket walk above hit a
+                 * match.  Stop now: scanning further buckets would reset it
+                 * to NULL on their non-matching entries and lose the match. */
+                if (peer != NULL)
+                        break;
+        }
+
+        if (peer != NULL) {
+                ksock_route_t *route;
+                ksock_conn_t  *conn;
+
+                /* txq / zc_req are printed as 0/1 "queue non-empty" flags */
+                CWARN ("Active peer on shutdown: %s, ref %d, scnt %d, "
+                       "closing %d, accepting %d, err %d, zcookie "LPU64", "
+                       "txq %d, zc_req %d\n", libcfs_id2str(peer->ksnp_id),
+                       atomic_read(&peer->ksnp_refcount),
+                       peer->ksnp_sharecount, peer->ksnp_closing,
+                       peer->ksnp_accepting, peer->ksnp_error,
+                       peer->ksnp_zc_next_cookie,
+                       !list_empty(&peer->ksnp_tx_queue),
+                       !list_empty(&peer->ksnp_zc_req_list));
+
+                list_for_each (tmp, &peer->ksnp_routes) {
+                        route = list_entry(tmp, ksock_route_t, ksnr_list);
+                        CWARN ("Route: ref %d, schd %d, conn %d, cnted %d, "
+                               "del %d\n", atomic_read(&route->ksnr_refcount),
+                               route->ksnr_scheduled, route->ksnr_connecting,
+                               route->ksnr_connected, route->ksnr_deleted);
+                }
+
+                list_for_each (tmp, &peer->ksnp_conns) {
+                        conn = list_entry(tmp, ksock_conn_t, ksnc_list);
+                        CWARN ("Conn: ref %d, sref %d, t %d, c %d\n",
+                               atomic_read(&conn->ksnc_conn_refcount),
+                               atomic_read(&conn->ksnc_sock_refcount),
+                               conn->ksnc_type, conn->ksnc_closing);
+                }
+        }
+
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+        return;
+}
+
+void
ksocknal_shutdown (lnet_ni_t *ni)
{
ksock_net_t *net = ni->ni_data;
net->ksnn_npeers);
cfs_pause(cfs_time_seconds(1));
+ ksocknal_debug_peerhash(ni);
+
spin_lock_bh (&net->ksnn_lock);
}
spin_unlock_bh (&net->ksnn_lock);
write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+ /* I'm always called with a reference on kibnal_data.kib_ni
+ * so shutdown can't have started */
+ LASSERT (kibnal_data.kib_listen_handle != NULL);
+
peer2 = kibnal_find_peer_locked (nid);
if (peer2 != NULL) {
kibnal_peer_decref (peer);
break;
}
- kibnal_peer_connect_failed(conn->ibc_peer, active, status);
+ kibnal_peer_connect_failed(peer, active, status);
kibnal_conn_disconnected(conn);
return;
}
* peer instance... */
kibnal_conn_addref(conn); /* +1 ref for ibc_list */
list_add(&conn->ibc_list, &peer->ibp_conns);
- kibnal_close_stale_conns_locked (conn->ibc_peer,
- conn->ibc_incarnation);
+ kibnal_close_stale_conns_locked (peer, conn->ibc_incarnation);
if (!kibnal_peer_active(peer) || /* peer has been deleted */
conn->ibc_comms_error != 0 || /* comms error */
write_lock_irqsave(g_lock, flags);
+ if (kibnal_data.kib_listen_handle == NULL) {
+ write_unlock_irqrestore(g_lock, flags);
+
+ CWARN ("Shutdown has started, rejecting connreq from %s\n",
+ libcfs_nid2str(rxmsg.ibm_srcnid));
+ kibnal_peer_decref(peer);
+ reason = IBNAL_REJECT_FATAL;
+ goto reject;
+ }
+
peer2 = kibnal_find_peer_locked(rxmsg.ibm_srcnid);
if (peer2 != NULL) {
/* tie-break connection race in favour of the higher NID */
LASSERT (!in_interrupt ());
LASSERT (ndiov > 0);
- while (doffset > diov->kiov_len) {
+ while (doffset >= diov->kiov_len) {
doffset -= diov->kiov_len;
diov++;
ndiov--;
}
LASSERT (nsiov > 0);
- while (soffset > siov->kiov_len) {
+ while (soffset >= siov->kiov_len) {
soffset -= siov->kiov_len;
siov++;
nsiov--;
LASSERT (!in_interrupt ());
LASSERT (niov > 0);
- while (iovoffset > iov->iov_len) {
+ while (iovoffset >= iov->iov_len) {
iovoffset -= iov->iov_len;
iov++;
niov--;
}
LASSERT (nkiov > 0);
- while (kiovoffset > kiov->kiov_len) {
+ while (kiovoffset >= kiov->kiov_len) {
kiovoffset -= kiov->kiov_len;
kiov++;
nkiov--;
LASSERT (!in_interrupt ());
LASSERT (nkiov > 0);
- while (kiovoffset > kiov->kiov_len) {
+ while (kiovoffset >= kiov->kiov_len) {
kiovoffset -= kiov->kiov_len;
kiov++;
nkiov--;
}
LASSERT (niov > 0);
- while (iovoffset > iov->iov_len) {
+ while (iovoffset >= iov->iov_len) {
iovoffset -= iov->iov_len;
iov++;
niov--;
int i;
LIBCFS_ALLOC(rb, sz);
+ if (rb == NULL)
+ return NULL;
rb->rb_pool = rbp;