From 02caf7170762d97dac4f367651addc7d90b6eb32 Mon Sep 17 00:00:00 2001 From: Serguei Smirnov Date: Wed, 7 Feb 2024 10:48:08 -0800 Subject: [PATCH] LU-17258 socklnd: stop connecting on too many retries If peer repeatedly rejects connection requests with EALREADY, assume that it doesn't support as many connections as we're trying to create. Make sure to stop connecting to the peer altogether and either continue with already created connections if there's at least one of each type, or fail. This helps avoid the assertion: "ASSERTION( (wanted & ((((1UL))) << (3))) != 0 ) failed" Test-Parameters: trivial testlist=sanity-lnet Fixes: 5afe3b053 ("LU-17258 socklnd: ensure connection type established upon race") Signed-off-by: Serguei Smirnov Change-Id: I6072e91cc36544fc2f56c91cd78f6637cf82ecbc Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53955 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Frank Sehr Reviewed-by: Andreas Dilger Reviewed-by: Cyril Bordage Reviewed-by: Oleg Drokin --- lnet/klnds/socklnd/socklnd.c | 6 +++--- lnet/klnds/socklnd/socklnd_cb.c | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 10cf56c..9d7ad25 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -450,16 +450,16 @@ ksocknal_decr_conn_count(struct ksock_conn_cb *conn_cb, break; case SOCKLND_CONN_BULK_IN: conn_cb->ksnr_blki_conn_count--; - if (conn_cb->ksnr_blki_conn_count < conn_cb->ksnr_max_conns) + if (conn_cb->ksnr_blki_conn_count == 0) conn_cb->ksnr_connected &= ~BIT(type); break; case SOCKLND_CONN_BULK_OUT: conn_cb->ksnr_blko_conn_count--; - if (conn_cb->ksnr_blko_conn_count < conn_cb->ksnr_max_conns) + if (conn_cb->ksnr_blko_conn_count == 0) conn_cb->ksnr_connected &= ~BIT(type); break; case SOCKLND_CONN_ANY: - if (conn_cb->ksnr_conn_count < conn_cb->ksnr_max_conns) + if (conn_cb->ksnr_conn_count == 0) conn_cb->ksnr_connected &= ~BIT(type); break; 
default: diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 30e4771..1e47816 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -2043,7 +2043,20 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb) /* After so many retries due to EALREADY assume that * the peer doesn't support as many connections as we want */ - conn_cb->ksnr_connected |= BIT(type); + if (conn_cb->ksnr_blki_conn_count && + conn_cb->ksnr_blko_conn_count && + conn_cb->ksnr_ctrl_conn_count) { + /* Don't create any more connections of any type */ + conn_cb->ksnr_connected |= (BIT(SOCKLND_CONN_CONTROL) | + BIT(SOCKLND_CONN_BULK_IN) | + BIT(SOCKLND_CONN_BULK_OUT)); + } else { + /* If we don't have at least one connection of each + * type, fail + */ + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + goto failed; + } retry_later = false; } -- 1.8.3.1