From 824120da92fe8feb4b4308a136e33ec65fe3b635 Mon Sep 17 00:00:00 2001 From: Sergey Cheremencev Date: Tue, 27 Dec 2016 23:29:52 +0300 Subject: [PATCH] LU-9094 o2iblnd: kill timedout txs from ibp_tx_queue Sometimes a connection can't be established for a long time due to rejections, which produces a cycle of reconnections. Unlike the connection, the peer is not removed in each iteration. Thus, until the connection becomes established, txs stay queued in peer->ibp_tx_queue. This patch adds tx_deadline checking for txs in the peer's tx_queue. Change-Id: Id2623285c735d1dff40ec755a5c8d20e9c62e60a Signed-off-by: Sergey Cheremencev Seagate-bug-id: MRP-4056 Reviewed-on: https://review.whamcloud.com/25376 Tested-by: Jenkins Reviewed-by: Doug Oucharek Reviewed-by: Amir Shehata Tested-by: Maloo Reviewed-by: Oleg Drokin --- lnet/klnds/o2iblnd/o2iblnd_cb.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index 91a4561..309f8c3 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -3147,21 +3147,33 @@ kiblnd_check_conns (int idx) { struct list_head closes = LIST_HEAD_INIT(closes); struct list_head checksends = LIST_HEAD_INIT(checksends); + struct list_head timedout_txs = LIST_HEAD_INIT(timedout_txs); struct list_head *peers = &kiblnd_data.kib_peers[idx]; struct list_head *ptmp; kib_peer_ni_t *peer_ni; kib_conn_t *conn; + kib_tx_t *tx, *tx_tmp; struct list_head *ctmp; unsigned long flags; /* NB. We expect to have a look at all the peers and not find any * RDMAs to time out, so we just use a shared lock while we * take a look... 
*/ - read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); list_for_each(ptmp, peers) { peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list); + /* Check tx_deadline */ + list_for_each_entry_safe(tx, tx_tmp, &peer_ni->ibp_tx_queue, tx_list) { + if (cfs_time_aftereq(jiffies, tx->tx_deadline)) { + CWARN("Timed out tx for %s: %lu seconds\n", + libcfs_nid2str(peer_ni->ibp_nid), + cfs_duration_sec(jiffies - tx->tx_deadline)); + list_move(&tx->tx_list, &timedout_txs); + } + } + list_for_each(ctmp, &peer_ni->ibp_conns) { int timedout; int sendnoop; @@ -3199,7 +3211,10 @@ kiblnd_check_conns (int idx) } } - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (!list_empty(&timedout_txs)) + kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT); /* Handle timeout by closing the whole * connection. We can only be sure RDMA activity -- 1.8.3.1