From 2417dec0362fd54a80b83705e584c6f635749796 Mon Sep 17 00:00:00 2001 From: Serguei Smirnov Date: Mon, 5 Feb 2024 15:27:15 -0800 Subject: [PATCH] LU-17505 socklnd: return NETWORK_TIMEOUT to LNet on ETIMEDOUT Returning LNET_MSG_STATUS_LOCAL_TIMEOUT to LNet on ETIMEDOUT causes LNet to only decrement the local NI health score, while the issue may actually be with the remote NI. Changing this to return LNET_MSG_STATUS_NETWORK_TIMEOUT causes LNet to decrement both local NI and peer NI health. If local NI is ok, it will recover its health score quickly, but the affected peer NI health is lowered until peer NI is recovered. This helps LNet select healthy NIs of the same peer in the meantime. Lustre-change: https://review.whamcloud.com/53930 Lustre-commit: 099350d6e30218eb68d31cbfc7e9252a112e591f Test-Parameters: trivial testlist=sanity-lnet Signed-off-by: Serguei Smirnov Change-Id: I916772477d1fd63571447262880a33830746f002 Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/53964 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Cyril Bordage Reviewed-by: Frank Sehr Reviewed-by: Andreas Dilger --- lnet/klnds/socklnd/socklnd_cb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 7943f49..b3d7b00 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -439,7 +439,7 @@ ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error) if (tx->tx_hstatus == LNET_MSG_STATUS_OK) { if (error == -ETIMEDOUT) tx->tx_hstatus = - LNET_MSG_STATUS_LOCAL_TIMEOUT; + LNET_MSG_STATUS_NETWORK_TIMEOUT; else if (error == -ENETDOWN || error == -EHOSTUNREACH || error == -ENETUNREACH || @@ -2410,7 +2410,7 @@ ksocknal_find_timed_out_conn(struct ksock_peer_ni *peer_ni) ksocknal_conn_addref(conn); list_for_each_entry(tx, &conn->ksnc_tx_queue, tx_list) - tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT; + tx->tx_hstatus = LNET_MSG_STATUS_NETWORK_TIMEOUT; 
CNETERR("Timeout sending data to %s (%pI4h:%d) " "the network or that node may be down.\n", libcfs_id2str(peer_ni->ksnp_id), @@ -2438,7 +2438,7 @@ ksocknal_flush_stale_txs(struct ksock_peer_ni *peer_ni) if (ktime_get_seconds() < tx->tx_deadline) break; - tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT; + tx->tx_hstatus = LNET_MSG_STATUS_NETWORK_TIMEOUT; list_move_tail(&tx->tx_list, &stale_txs); } -- 1.8.3.1