From 099350d6e30218eb68d31cbfc7e9252a112e591f Mon Sep 17 00:00:00 2001 From: Serguei Smirnov Date: Mon, 5 Feb 2024 15:27:15 -0800 Subject: [PATCH] LU-17505 socklnd: return NETWORK_TIMEOUT to LNet on ETIMEDOUT Returning LNET_MSG_STATUS_LOCAL_TIMEOUT to LNet on ETIMEDOUT causes LNet to only decrement the local NI health score, while the issue may actually be with the remote NI. Changing this to return LNET_MSG_STATUS_NETWORK_TIMEOUT causes LNet to decrement both local NI and peer NI health. If local NI is ok, it will recover its health score quickly, but the affected peer NI health is lowered until peer NI is recovered. This helps LNet select healthy NIs of the same peer in the meantime. Signed-off-by: Serguei Smirnov Change-Id: I916772477d1fd63571447262880a33830746f002 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53930 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Frank Sehr Reviewed-by: Chris Horn Reviewed-by: Cyril Bordage Reviewed-by: Oleg Drokin --- lnet/klnds/socklnd/socklnd_cb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 51e1dba..30e4771 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -437,7 +437,7 @@ ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error) if (tx->tx_hstatus == LNET_MSG_STATUS_OK) { if (error == -ETIMEDOUT) tx->tx_hstatus = - LNET_MSG_STATUS_LOCAL_TIMEOUT; + LNET_MSG_STATUS_NETWORK_TIMEOUT; else if (error == -ENETDOWN || error == -EHOSTUNREACH || error == -ENETUNREACH || @@ -2418,7 +2418,7 @@ ksocknal_find_timed_out_conn(struct ksock_peer_ni *peer_ni) list_for_each_entry(tx, &conn->ksnc_tx_queue, tx_list) tx->tx_hstatus = - LNET_MSG_STATUS_LOCAL_TIMEOUT; + LNET_MSG_STATUS_NETWORK_TIMEOUT; CNETERR("Timeout sending data to %s (%pIScp) the network or that node may be down.\n", libcfs_idstr(&peer_ni->ksnp_id), &conn->ksnc_peeraddr); @@ -2445,7 +2445,7 @@ 
ksocknal_flush_stale_txs(struct ksock_peer_ni *peer_ni) if (ktime_get_seconds() < tx->tx_deadline) break; - tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_TIMEOUT; + tx->tx_hstatus = LNET_MSG_STATUS_NETWORK_TIMEOUT; list_move_tail(&tx->tx_list, &stale_txs); } -- 1.8.3.1