From: Liang Zhen Date: Tue, 6 Apr 2010 08:48:51 +0000 (+0200) Subject: b=20615 print more debug info for timedout ZC-req X-Git-Tag: 1.8.3~16 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=7012890e3952307c660f9b8957c937c5354dec93;p=fs%2Flustre-release.git b=20615 print more debug info for timedout ZC-req i=maxim i=isaac 1. output more information for timedout ZC-req and partial received connection 2. close connection for timedout ZC-req 3. always send ZC_ACK on non-blocking connection(BULK_IN) --- diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 573e790..84d7ddc 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -1550,6 +1550,8 @@ ksocknal_finalize_zcreq(ksock_conn_t *conn) LASSERT (tx->tx_msg.ksm_zc_cookies[0] != 0); tx->tx_msg.ksm_zc_cookies[0] = 0; + if (tx->tx_resid == 0) + tx->tx_resid = -1; /* mark it as not-acked */ list_del(&tx->tx_zc_list); list_add(&tx->tx_zc_list, &zlist); } @@ -1642,6 +1644,8 @@ ksocknal_queue_zombie_conn (ksock_conn_t *conn) void ksocknal_destroy_conn (ksock_conn_t *conn) { + cfs_time_t last_rcv; + /* Final coup-de-grace of the reaper */ CDEBUG (D_NET, "connection %p\n", conn); @@ -1656,10 +1660,16 @@ ksocknal_destroy_conn (ksock_conn_t *conn) /* complete current receive if any */ switch (conn->ksnc_rx_state) { case SOCKNAL_RX_LNET_PAYLOAD: - CERROR("Completing partial receive from %s" - ", ip %d.%d.%d.%d:%d, with error\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); + last_rcv = conn->ksnc_rx_deadline - + cfs_time_seconds(*ksocknal_tunables.ksnd_timeout); + CERROR("Completing partial receive from %s[%d]" + ", ip %d.%d.%d.%d:%d, with error, wanted: %d, left: %d, " + "last alive is %ld secs ago\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type, + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, + conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left, + cfs_duration_sec(cfs_time_sub(cfs_time_current(), + last_rcv))); lnet_finalize (conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, -EIO); break; diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index 5c9b1b5..6a61b96 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -120,6 +120,7 @@ typedef struct int *ksnd_peertimeout; /* seconds to consider peer dead */ int *ksnd_enable_csum; /* enable check sum */ int *ksnd_inject_csum_error; /* set non-zero to inject checksum error */ + int *ksnd_nonblk_zcack; /* always send zc-ack on non-blocking connection */ unsigned int *ksnd_zc_min_payload; /* minimum zero copy payload size */ int *ksnd_zc_recv; /* enable ZC receive (for Chelsio TOE) */ int *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */ diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 14b451f..f4b0174 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -1270,8 +1270,9 @@ ksocknal_process_receive (ksock_conn_t *conn) id = &conn->ksnc_peer->ksnp_id; rc = conn->ksnc_proto->pro_handle_zcreq(conn, - conn->ksnc_msg.ksm_zc_cookies[0], - le64_to_cpu(lhdr->src_nid) != id->nid); + conn->ksnc_msg.ksm_zc_cookies[0], + *ksocknal_tunables.ksnd_nonblk_zcack || + le64_to_cpu(lhdr->src_nid) != id->nid); } lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc); @@ -2335,6 +2336,7 @@ ksocknal_check_peer_timeouts (int idx) struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; ksock_peer_t *peer; ksock_conn_t *conn; + ksock_tx_t *tx; again: /* NB. We expect to have a look at all the peers and not find any @@ -2343,6 +2345,10 @@ ksocknal_check_peer_timeouts (int idx) cfs_read_lock (&ksocknal_data.ksnd_global_lock); list_for_each_entry(peer, peers, ksnp_list) { + cfs_time_t deadline = 0; + int resid = 0; + int n = 0; + if (ksocknal_send_keepalive_locked(peer) != 0) { read_unlock (&ksocknal_data.ksnd_global_lock); goto again; @@ -2380,29 +2386,45 @@ ksocknal_check_peer_timeouts (int idx) goto again; } } - } - /* print out warnings about stale ZC_REQs */ - list_for_each_entry(peer, peers, ksnp_list) { - ksock_tx_t *tx; - int n = 0; + if (list_empty(&peer->ksnp_zc_req_list)) + continue; + cfs_spin_lock(&peer->ksnp_lock); list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) { if (!cfs_time_aftereq(cfs_time_current(), tx->tx_deadline)) break; + /* ignore the TX if connection is being closed */ + if (tx->tx_conn->ksnc_closing) + continue; n++; } - - if (n != 0) { - tx = list_entry (peer->ksnp_zc_req_list.next, - ksock_tx_t, tx_zc_list); - CWARN("Stale ZC_REQs for peer %s detected: %d; the " - "oldest (%p) timed out %ld secs ago\n", - libcfs_nid2str(peer->ksnp_id.nid), n, tx, - cfs_duration_sec(cfs_time_current() - - tx->tx_deadline)); + if (n == 0) { + cfs_spin_unlock(&peer->ksnp_lock); + continue; } + + tx = list_entry(peer->ksnp_zc_req_list.next, + ksock_tx_t, tx_zc_list); + deadline = tx->tx_deadline; + resid = tx->tx_resid; + conn = tx->tx_conn; + ksocknal_conn_addref(conn); + + cfs_spin_unlock(&peer->ksnp_lock); + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); + + CERROR("Total %d stale ZC_REQs for peer %s detected; the " + "oldest(%p) timed out %ld secs ago, " + "resid: %d, wmem: %d\n", + n, libcfs_nid2str(peer->ksnp_id.nid), tx, + cfs_duration_sec(cfs_time_current() - deadline), + resid, libcfs_sock_wmem_queued(conn->ksnc_sock)); + + ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); + ksocknal_conn_decref(conn); + goto again; } cfs_read_unlock (&ksocknal_data.ksnd_global_lock); diff --git a/lnet/klnds/socklnd/socklnd_modparams.c b/lnet/klnds/socklnd/socklnd_modparams.c index c4a6261..853c08e 100644 --- a/lnet/klnds/socklnd/socklnd_modparams.c +++ b/lnet/klnds/socklnd/socklnd_modparams.c @@ -129,6 +129,10 @@ CFS_MODULE_PARM(enable_irq_affinity, "i", int, 0644, "enable IRQ affinity"); #endif +static int nonblk_zcack = 1; +CFS_MODULE_PARM(nonblk_zcack, "i", int, 0644, + "always send ZC-ACK on non-blocking connection"); + static unsigned int zc_min_payload = (16 << 10); CFS_MODULE_PARM(zc_min_payload, "i", int, 0644, "minimum payload size to zero copy"); @@ -179,6 +183,7 @@ ksock_tunables_t ksocknal_tunables = { .ksnd_keepalive_intvl = &keepalive_intvl, .ksnd_enable_csum = &enable_csum, .ksnd_inject_csum_error = &inject_csum_error, + .ksnd_nonblk_zcack = &nonblk_zcack, .ksnd_zc_min_payload = &zc_min_payload, .ksnd_zc_recv = &zc_recv, .ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags,