From f6cd596982ed4380e5547181022ad81e4c6d3512 Mon Sep 17 00:00:00 2001 From: maxim Date: Fri, 5 Sep 2008 14:58:15 +0000 Subject: [PATCH] b=16308 i=isaac i=liang Conf-sanity test_32a couldn't stop ost and mds because it tried to access non-existent peer and tcp connect took quite long before timing out. The patch flushes txs pinned to a peer even if it's still in "connecting" state. --- lnet/ChangeLog | 7 ++++ lnet/klnds/socklnd/socklnd.h | 1 + lnet/klnds/socklnd/socklnd_cb.c | 75 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+) diff --git a/lnet/ChangeLog b/lnet/ChangeLog index 480282c..b331348 100644 --- a/lnet/ChangeLog +++ b/lnet/ChangeLog @@ -18,6 +18,13 @@ Description: Details : Severity : normal +Bugzilla : 16308 +Description: finalize network operation in reasonable time +Details : conf-sanity test_32a couldn't stop ost and mds because it + tried to access non-existent peer and tcp connect took + quite long before timing out. + +Severity : normal Bugzilla : 13139 Description: Remove portals compatibility Details : Remove portals compatibility, not interoperable with releases diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index 28bcc75..bfda248 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -209,6 +209,7 @@ typedef struct /* transmit packet */ lnet_kiov_t *tx_kiov; /* packet page frags */ struct ksock_conn *tx_conn; /* owning conn */ lnet_msg_t *tx_lnetmsg; /* lnet message for lnet_finalize() */ + cfs_time_t tx_deadline; /* when (in jiffies) tx times out */ ksock_msg_t tx_msg; /* socklnd message buffer */ int tx_desc_size; /* size of this descriptor */ union { diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 318a4b9..efec018 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -463,6 +463,10 @@ ksocknal_check_zc_req(ksock_tx_t *tx) spin_lock(&peer->ksnp_lock); + /* ZC_REQ is going to be pinned to the peer 
*/ + tx->tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + LASSERT (tx->tx_msg.ksm_zc_req_cookie == 0); tx->tx_msg.ksm_zc_req_cookie = peer->ksnp_zc_next_cookie++; list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list); @@ -980,6 +984,10 @@ ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id) if (peer->ksnp_accepting > 0 || ksocknal_find_connecting_route_locked (peer) != NULL) { + /* the message is about to be queued on the peer: set its deadline now */ + tx->tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + /* Queue the message until a connection is established */ list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue); write_unlock_bh (g_lock); @@ -2601,6 +2609,31 @@ ksocknal_find_timed_out_conn (ksock_peer_t *peer) return (NULL); } +static inline void +ksocknal_flush_stale_txs(ksock_peer_t *peer) /* moves expired txs off the head of peer->ksnp_tx_queue and passes them to ksocknal_txlist_done() */ +{ + ksock_tx_t *tx; + CFS_LIST_HEAD (stale_txs); /* private list collecting the expired txs */ + + write_lock_bh (&ksocknal_data.ksnd_global_lock); + + while (!list_empty (&peer->ksnp_tx_queue)) { + tx = list_entry (peer->ksnp_tx_queue.next, + ksock_tx_t, tx_list); + + if (!cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) + break; /* stop at the first tx whose deadline has not passed yet */ + + list_del (&tx->tx_list); + list_add_tail (&tx->tx_list, &stale_txs); + } + + write_unlock_bh (&ksocknal_data.ksnd_global_lock); + + ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1); /* called only after the write lock has been dropped */ +} + void ksocknal_check_peer_timeouts (int idx) { @@ -2630,8 +2663,50 @@ ksocknal_check_peer_timeouts (int idx) ksocknal_conn_decref(conn); goto again; } + + /* stale txs cannot be flushed right here: only the shared (read) lock is + * held, while ksocknal_flush_stale_txs() takes the write lock itself */ + if (!list_empty (&peer->ksnp_tx_queue)) { + ksock_tx_t *tx = list_entry (peer->ksnp_tx_queue.next, + ksock_tx_t, tx_list); + + if (cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) { + + ksocknal_peer_addref(peer); + read_unlock (&ksocknal_data.ksnd_global_lock); + + ksocknal_flush_stale_txs(peer); + + ksocknal_peer_decref(peer); + goto again; + } + } } + /* print out warnings about stale ZC_REQs. NOTE(review): ksnp_zc_req_list is updated under peer->ksnp_lock in ksocknal_check_zc_req(), but the walk below holds only the global read lock - confirm this is safe */ + 
list_for_each_entry(peer, peers, ksnp_list) { + ksock_tx_t *tx; + int n = 0; + + list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) { + if (!cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) + break; + n++; + } + + if (n != 0) { + tx = list_entry (peer->ksnp_zc_req_list.next, + ksock_tx_t, tx_zc_list); + CWARN("Stale ZC_REQs for peer %s detected: %d; the " + "oldest (%p) timed out %ld secs ago\n", + libcfs_nid2str(peer->ksnp_id.nid), n, tx, + cfs_duration_sec(cfs_time_current() - + tx->tx_deadline)); + } + } + read_unlock (&ksocknal_data.ksnd_global_lock); } -- 1.8.3.1