From a657f3ab167bb6cf85b85141e1e9c034663c5b55 Mon Sep 17 00:00:00 2001 From: Serguei Smirnov Date: Wed, 7 Feb 2024 10:48:08 -0800 Subject: [PATCH] LU-17258 socklnd: stop connecting on too many retries If peer repeatedly rejects connection requests with EALREADY, assume that it doesn't support as many connections as we're trying to create. Make sure to stop connecting to the peer altogether and either continue with already created connections if there's at least one of each type, or fail. This helps avoid the assertion: "ASSERTION( (wanted & ((((1UL))) << (3))) != 0 ) failed" Lustre-change: https://review.whamcloud.com/53955 Lustre-commit: 02caf7170762d97dac4f367651addc7d90b6eb32 Test-Parameters: trivial testlist=sanity-lnet Fixes: 5afe3b053 ("LU-17258 socklnd: ensure connection type established upon race") Signed-off-by: Serguei Smirnov Change-Id: I6072e91cc36544fc2f56c91cd78f6637cf82ecbc Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/54014 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Frank Sehr Reviewed-by: Cyril Bordage Reviewed-by: Andreas Dilger --- lnet/klnds/socklnd/socklnd.c | 6 +++--- lnet/klnds/socklnd/socklnd_cb.c | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index d00731b..b3e2338 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -414,16 +414,16 @@ ksocknal_decr_conn_count(struct ksock_conn_cb *conn_cb, break; case SOCKLND_CONN_BULK_IN: conn_cb->ksnr_blki_conn_count--; - if (conn_cb->ksnr_blki_conn_count < conn_cb->ksnr_max_conns) + if (conn_cb->ksnr_blki_conn_count == 0) conn_cb->ksnr_connected &= ~BIT(type); break; case SOCKLND_CONN_BULK_OUT: conn_cb->ksnr_blko_conn_count--; - if (conn_cb->ksnr_blko_conn_count < conn_cb->ksnr_max_conns) + if (conn_cb->ksnr_blko_conn_count == 0) conn_cb->ksnr_connected &= ~BIT(type); break; case SOCKLND_CONN_ANY: - if (conn_cb->ksnr_conn_count < conn_cb->ksnr_max_conns) + 
if (conn_cb->ksnr_conn_count == 0) conn_cb->ksnr_connected &= ~BIT(type); break; default: diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index b3d7b00..d2a83a2 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -2024,8 +2024,22 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb) /* After so many retries due to EALREADY assume that * the peer doesn't support as many connections as we want */ - conn_cb->ksnr_connected |= BIT(type); conn_cb->ksnr_max_retries |= BIT(type); + + if (conn_cb->ksnr_blki_conn_count && + conn_cb->ksnr_blko_conn_count && + conn_cb->ksnr_ctrl_conn_count) { + /* Don't create any more connections of any type */ + conn_cb->ksnr_connected |= (BIT(SOCKLND_CONN_CONTROL) | + BIT(SOCKLND_CONN_BULK_IN) | + BIT(SOCKLND_CONN_BULK_OUT)); + } else { + /* If we don't have at least one connection of each + * type, fail + */ + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + goto failed; + } retry_later = false; } -- 1.8.3.1