From e08ac764867a0e36b303f511c94a0fa27e3dd53d Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Wed, 17 Jun 2020 02:19:52 +0000 Subject: [PATCH] LU-13675 o2iblnd: revert 'Timed out tx' patch Revert "LU-1742 o2iblnd: 'Timed out tx' error message" patch as this is causing crashes in o2iblnd consistently. This reverts commit 7308662efc02fde077216f54728ecf278f31311b. Test-Parameters: trivial Change-Id: I470023f41eb1123de92aa3d86b32c7893363bc4e Signed-off-by: Andreas Dilger Reviewed-on: https://review.whamcloud.com/38958 Reviewed-by: Serguei Smirnov Tested-by: Shuichi Ihara Tested-by: jenkins Reviewed-by: Olaf Faaland-LLNL Tested-by: Maloo Reviewed-by: Oleg Drokin --- lnet/klnds/o2iblnd/o2iblnd.h | 2 -- lnet/klnds/o2iblnd/o2iblnd_cb.c | 47 ++++++++++++----------------------------- 2 files changed, 13 insertions(+), 36 deletions(-) diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 9bb0b60..3793817 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -636,8 +636,6 @@ struct kib_tx { /* transmit message */ struct kib_fmr tx_fmr; /* dma direction */ int tx_dmadir; - /* time when tx added on ibc_active_txs */ - ktime_t tx_on_activeq; }; struct kib_connvars { diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index 0424a135..3b3ede1 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -874,16 +874,15 @@ __must_hold(&conn->ibc_lock) * from the first send; hence the ++ rather than = below. */ tx->tx_sending++; list_add(&tx->tx_list, &conn->ibc_active_txs); - tx->tx_on_activeq = ktime_get(); - - /* I'm still holding ibc_lock! */ - if (conn->ibc_state != IBLND_CONN_ESTABLISHED) { - rc = -ECONNABORTED; - } else if (tx->tx_pool->tpo_pool.po_failed || - conn->ibc_hdev != tx->tx_pool->tpo_hdev) { - /* close_conn will launch failover */ - rc = -ENETDOWN; - } else { + + /* I'm still holding ibc_lock! */ + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) { + rc = -ECONNABORTED; + } else if (tx->tx_pool->tpo_pool.po_failed || + conn->ibc_hdev != tx->tx_pool->tpo_hdev) { + /* close_conn will launch failover */ + rc = -ENETDOWN; + } else { struct kib_fast_reg_descriptor *frd = tx->tx_fmr.fmr_frd; struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr; struct ib_send_wr *wr = &tx->tx_wrq[0].wr; @@ -3228,8 +3227,6 @@ kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs) { struct kib_tx *tx; struct list_head *ttmp; - bool active_txs = strcmp(kiblnd_queue2str(conn, txs), - "active_txs") == 0; list_for_each(ttmp, txs) { tx = list_entry(ttmp, struct kib_tx, tx_list); @@ -3241,31 +3238,13 @@ kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs) LASSERT(tx->tx_waiting || tx->tx_sending != 0); } - if (ktime_compare(ktime_get(), tx->tx_deadline) < 0) - continue; - - if (!active_txs) { - CERROR("Timed out tx: %s, " - "outstanding RDMA time: %lld sec\n", - kiblnd_queue2str(conn, txs), - *kiblnd_tunables.kib_timeout + - (ktime_ms_delta(ktime_get(), - tx->tx_deadline) / MSEC_PER_SEC)); - } else { - CERROR("Timed out tx: %s, time in internal queue: %lld " - "sec, time in active queue: %lld sec," - " outstanding RDMA time: %lld sec\n", + if (ktime_compare(ktime_get(), tx->tx_deadline) >= 0) { + CERROR("Timed out tx: %s, %lld seconds\n", kiblnd_queue2str(conn, txs), - ktime_ms_delta(tx->tx_deadline, - tx->tx_on_activeq) / MSEC_PER_SEC, ktime_ms_delta(ktime_get(), - tx->tx_on_activeq) / MSEC_PER_SEC, - *kiblnd_tunables.kib_timeout + - (ktime_ms_delta(ktime_get(), - tx->tx_deadline) / MSEC_PER_SEC)); + tx->tx_deadline) / MSEC_PER_SEC); + return 1; } - - return 1; } return 0; -- 1.8.3.1