From: Sonia Sharma Date: Thu, 6 Sep 2018 03:39:23 +0000 (-0400) Subject: LU-1742 o2iblnd: 'Timed out tx' error message X-Git-Tag: 2.13.55~147 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=7308662efc02fde077216f54728ecf278f31311b;ds=sidebyside LU-1742 o2iblnd: 'Timed out tx' error message Fix the error message in kiblnd_check_txs_locked() to report the total RDMA time outstanding rather than the number of seconds past the deadline. This patch also adds time_on_activeq to struct kib_tx so the time spent by tx in internal queue and active queue can be tracked and reported. This would help in diagnosing the issue. Change-Id: I4e486389220e383af88dbc482646e92a85bd5b14 Test-Parameters: trivial Signed-off-by: Sonia Sharma Reviewed-on: https://review.whamcloud.com/33235 Reviewed-by: Andreas Dilger Reviewed-by: Stephen Champion Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 3793817..9bb0b60 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -636,6 +636,8 @@ struct kib_tx { /* transmit message */ struct kib_fmr tx_fmr; /* dma direction */ int tx_dmadir; + /* time when tx added on ibc_active_txs */ + ktime_t tx_on_activeq; }; struct kib_connvars { diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index 0117d40..b2d0427 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -873,15 +873,16 @@ __must_hold(&conn->ibc_lock) * from the first send; hence the ++ rather than = below. */ tx->tx_sending++; list_add(&tx->tx_list, &conn->ibc_active_txs); - - /* I'm still holding ibc_lock! */ - if (conn->ibc_state != IBLND_CONN_ESTABLISHED) { - rc = -ECONNABORTED; - } else if (tx->tx_pool->tpo_pool.po_failed || - conn->ibc_hdev != tx->tx_pool->tpo_hdev) { - /* close_conn will launch failover */ - rc = -ENETDOWN; - } else { + tx->tx_on_activeq = ktime_get(); + + /* I'm still holding ibc_lock! */ + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) { + rc = -ECONNABORTED; + } else if (tx->tx_pool->tpo_pool.po_failed || + conn->ibc_hdev != tx->tx_pool->tpo_hdev) { + /* close_conn will launch failover */ + rc = -ENETDOWN; + } else { struct kib_fast_reg_descriptor *frd = tx->tx_fmr.fmr_frd; struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr; struct ib_send_wr *wr = &tx->tx_wrq[0].wr; @@ -3226,6 +3227,8 @@ kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs) { struct kib_tx *tx; struct list_head *ttmp; + bool active_txs = strcmp(kiblnd_queue2str(conn, txs), + "active_txs") == 0; list_for_each(ttmp, txs) { tx = list_entry(ttmp, struct kib_tx, tx_list); @@ -3237,13 +3240,31 @@ kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs) LASSERT(tx->tx_waiting || tx->tx_sending != 0); } - if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) { - CERROR("Timed out tx: %s, %lld seconds\n", + if (ktime_compare(ktime_get(), tx->tx_deadline) < 0) + continue; + + if (!active_txs) { + CERROR("Timed out tx: %s, " + "outstanding RDMA time: %lld sec\n", + kiblnd_queue2str(conn, txs), + *kiblnd_tunables.kib_timeout + + (ktime_ms_delta(ktime_get(), + tx->tx_deadline) / MSEC_PER_SEC)); + } else { + CERROR("Timed out tx: %s, time in internal queue: %lld " + "sec, time in active queue: %lld sec," + " outstanding RDMA time: %lld sec\n", kiblnd_queue2str(conn, txs), + ktime_ms_delta(tx->tx_deadline, + tx->tx_on_activeq) / MSEC_PER_SEC, ktime_ms_delta(ktime_get(), - tx->tx_deadline) / MSEC_PER_SEC); - return 1; + tx->tx_on_activeq) / MSEC_PER_SEC, + *kiblnd_tunables.kib_timeout + + (ktime_ms_delta(ktime_get(), + tx->tx_deadline) / MSEC_PER_SEC)); } + + return 1; } return 0;