Whamcloud - gitweb
LU-1742 o2iblnd: 'Timed out tx' error message 35/33235/9
authorSonia Sharma <sharmaso@whamcloud.com>
Thu, 6 Sep 2018 03:39:23 +0000 (23:39 -0400)
committerOleg Drokin <green@whamcloud.com>
Wed, 10 Jun 2020 20:51:59 +0000 (20:51 +0000)
Fix the error message in kiblnd_check_txs_locked()
to report the total RDMA time outstanding rather
than the number of seconds past the deadline.

This patch also adds time_on_activeq to struct kib_tx
so the time spent by tx in internal queue and active
queue can be tracked and reported. This would help
in diagnosing the issue.

Change-Id: I4e486389220e383af88dbc482646e92a85bd5b14
Test-Parameters: trivial
Signed-off-by: Sonia Sharma <sharmaso@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/33235
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Stephen Champion <stephen.champion@hpe.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/o2iblnd/o2iblnd.h
lnet/klnds/o2iblnd/o2iblnd_cb.c

index 3793817..9bb0b60 100644 (file)
@@ -636,6 +636,8 @@ struct kib_tx {                                     /* transmit message */
        struct kib_fmr          tx_fmr;
                                /* dma direction */
        int                     tx_dmadir;
+       /* time when tx added on ibc_active_txs */
+       ktime_t                 tx_on_activeq;
 };
 
 struct kib_connvars {
index 0117d40..b2d0427 100644 (file)
@@ -873,15 +873,16 @@ __must_hold(&conn->ibc_lock)
         * from the first send; hence the ++ rather than = below. */
        tx->tx_sending++;
        list_add(&tx->tx_list, &conn->ibc_active_txs);
-
-        /* I'm still holding ibc_lock! */
-        if (conn->ibc_state != IBLND_CONN_ESTABLISHED) {
-                rc = -ECONNABORTED;
-        } else if (tx->tx_pool->tpo_pool.po_failed ||
-                 conn->ibc_hdev != tx->tx_pool->tpo_hdev) {
-                /* close_conn will launch failover */
-                rc = -ENETDOWN;
-        } else {
+       tx->tx_on_activeq = ktime_get();
+
+       /* I'm still holding ibc_lock! */
+       if (conn->ibc_state != IBLND_CONN_ESTABLISHED) {
+               rc = -ECONNABORTED;
+       } else if (tx->tx_pool->tpo_pool.po_failed ||
+               conn->ibc_hdev != tx->tx_pool->tpo_hdev) {
+               /* close_conn will launch failover */
+               rc = -ENETDOWN;
+       } else {
                struct kib_fast_reg_descriptor *frd = tx->tx_fmr.fmr_frd;
                struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr;
                struct ib_send_wr *wr  = &tx->tx_wrq[0].wr;
@@ -3226,6 +3227,8 @@ kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs)
 {
        struct kib_tx *tx;
        struct list_head *ttmp;
+       bool active_txs = strcmp(kiblnd_queue2str(conn, txs),
+                                "active_txs") == 0;
 
        list_for_each(ttmp, txs) {
                tx = list_entry(ttmp, struct kib_tx, tx_list);
@@ -3237,13 +3240,31 @@ kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs)
                        LASSERT(tx->tx_waiting || tx->tx_sending != 0);
                }
 
-               if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) {
-                       CERROR("Timed out tx: %s, %lld seconds\n",
+               if (ktime_compare(ktime_get(), tx->tx_deadline) < 0)
+                       continue;
+
+               if (!active_txs) {
+                       CERROR("Timed out tx: %s, "
+                              "outstanding RDMA time: %lld sec\n",
+                              kiblnd_queue2str(conn, txs),
+                              *kiblnd_tunables.kib_timeout +
+                              (ktime_ms_delta(ktime_get(),
+                                              tx->tx_deadline) / MSEC_PER_SEC));
+               } else {
+                       CERROR("Timed out tx: %s, time in internal queue: %lld "
+                              "sec, time in active queue: %lld sec,"
+                              " outstanding RDMA time: %lld sec\n",
                               kiblnd_queue2str(conn, txs),
+                              ktime_ms_delta(tx->tx_deadline,
+                                             tx->tx_on_activeq) / MSEC_PER_SEC,
                               ktime_ms_delta(ktime_get(),
-                                             tx->tx_deadline) / MSEC_PER_SEC);
-                       return 1;
+                                             tx->tx_on_activeq) / MSEC_PER_SEC,
+                              *kiblnd_tunables.kib_timeout +
+                              (ktime_ms_delta(ktime_get(),
+                                              tx->tx_deadline) / MSEC_PER_SEC));
                }
+
+               return 1;
        }
 
        return 0;