Whamcloud - gitweb
LU-9094 o2iblnd: kill timedout txs from ibp_tx_queue 76/25376/2
author Sergey Cheremencev <sergey.cheremencev@seagate.com>
Tue, 27 Dec 2016 20:29:52 +0000 (23:29 +0300)
committer Oleg Drokin <oleg.drokin@intel.com>
Wed, 1 Mar 2017 05:10:47 +0000 (05:10 +0000)
Sometimes a connection cannot be established for a long time
because of repeated rejections, which produces a cycle of
reconnections. Unlike the connection, the peer is not removed
on each iteration. Thus, until the connection is established,
txs remain queued in peer->ibp_tx_queue. This patch adds
tx_deadline checking for the txs in the peer's ibp_tx_queue:
txs whose deadline has passed are removed from the queue and
completed with -ETIMEDOUT.

Change-Id: Id2623285c735d1dff40ec755a5c8d20e9c62e60a
Signed-off-by: Sergey Cheremencev <sergey.cheremencev@seagate.com>
Seagate-bug-id: MRP-4056
Reviewed-on: https://review.whamcloud.com/25376
Tested-by: Jenkins
Reviewed-by: Doug Oucharek <doug.s.oucharek@intel.com>
Reviewed-by: Amir Shehata <amir.shehata@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lnet/klnds/o2iblnd/o2iblnd_cb.c

index 91a4561..309f8c3 100644 (file)
@@ -3147,21 +3147,33 @@ kiblnd_check_conns (int idx)
 {
        struct list_head  closes = LIST_HEAD_INIT(closes);
        struct list_head  checksends = LIST_HEAD_INIT(checksends);
+       struct list_head  timedout_txs = LIST_HEAD_INIT(timedout_txs);
        struct list_head *peers = &kiblnd_data.kib_peers[idx];
        struct list_head *ptmp;
        kib_peer_ni_t    *peer_ni;
        kib_conn_t       *conn;
+       kib_tx_t         *tx, *tx_tmp;
        struct list_head *ctmp;
        unsigned long     flags;
 
        /* NB. We expect to have a look at all the peers and not find any
         * RDMAs to time out, so we just use a shared lock while we
         * take a look... */
-       read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+       write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
 
        list_for_each(ptmp, peers) {
                peer_ni = list_entry(ptmp, kib_peer_ni_t, ibp_list);
 
+               /* Check tx_deadline */
+               list_for_each_entry_safe(tx, tx_tmp, &peer_ni->ibp_tx_queue, tx_list) {
+                       if (cfs_time_aftereq(jiffies, tx->tx_deadline)) {
+                               CWARN("Timed out tx for %s: %lu seconds\n",
+                                     libcfs_nid2str(peer_ni->ibp_nid),
+                                     cfs_duration_sec(jiffies - tx->tx_deadline));
+                               list_move(&tx->tx_list, &timedout_txs);
+                       }
+               }
+
                list_for_each(ctmp, &peer_ni->ibp_conns) {
                        int timedout;
                        int sendnoop;
@@ -3199,7 +3211,10 @@ kiblnd_check_conns (int idx)
                }
        }
 
-       read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+       write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+       if (!list_empty(&timedout_txs))
+               kiblnd_txlist_done(&timedout_txs, -ETIMEDOUT);
 
        /* Handle timeout by closing the whole
         * connection. We can only be sure RDMA activity