From da893c6c9707ca3b2e7532d05f754fccf1cffc74 Mon Sep 17 00:00:00 2001
From: Serguei Smirnov
Date: Mon, 26 Sep 2022 16:47:24 -0700
Subject: [PATCH] LU-16191 socklnd: limit retries on conns_per_peer mismatch

If connection initiator has a higher conns-per-peer setting
than its peer, don't try to create extra connections forever
as the peer will keep rejecting them.
A few retries should suffice to resolve a valid race.

Test-Parameters: trivial
Fixes: 71b2476e ("LU-12815 socklnd: add conns_per_peer parameter")
Signed-off-by: Serguei Smirnov
Change-Id: I7d04d4ac41e98a738b6c85c3d323608038f5c51e
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48664
Reviewed-by: Frank Sehr
Reviewed-by: Chris Horn
Reviewed-by: Oleg Drokin
Tested-by: jenkins
Tested-by: Maloo
---
 lnet/klnds/socklnd/socklnd.c    |  1 +
 lnet/klnds/socklnd/socklnd.h    |  4 ++++
 lnet/klnds/socklnd/socklnd_cb.c | 21 ++++++++++++++++++---
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c
index f650a45..7d991a1 100644
--- a/lnet/klnds/socklnd/socklnd.c
+++ b/lnet/klnds/socklnd/socklnd.c
@@ -144,6 +144,7 @@ ksocknal_create_conn_cb(struct sockaddr *addr)
 	conn_cb->ksnr_blki_conn_count = 0;
 	conn_cb->ksnr_blko_conn_count = 0;
 	conn_cb->ksnr_max_conns = 0;
+	conn_cb->ksnr_busy_retry_count = 0;
 
 	return conn_cb;
 }
diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h
index c92ae3c..c9a9fdb 100644
--- a/lnet/klnds/socklnd/socklnd.h
+++ b/lnet/klnds/socklnd/socklnd.h
@@ -365,6 +365,7 @@ struct ksock_conn {
 };
 
 #define SOCKNAL_CONN_COUNT_MAX_BITS	8	/* max conn count bits */
+#define SOCKNAL_MAX_BUSY_RETRIES	3
 
 struct ksock_conn_cb {
 	struct list_head	ksnr_connd_list;/* chain on ksnr_connd_routes */
@@ -387,6 +388,9 @@ struct ksock_conn_cb {
 	unsigned int		ksnr_max_conns; /* conns_per_peer at peer
 						 * creation
 						 */
+	unsigned int		ksnr_busy_retry_count;/* counts retry attempts
+						       * due to EALREADY rc
+						       */
 };
 
 #define SOCKNAL_KEEPALIVE_PING		1	/* cookie for keepalive ping */
diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c
index a3b20d1..1ebf327 100644
--- a/lnet/klnds/socklnd/socklnd_cb.c
+++ b/lnet/klnds/socklnd/socklnd_cb.c
@@ -1907,7 +1907,7 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb)
 {
 	LIST_HEAD(zombies);
 	struct ksock_peer_ni *peer_ni = conn_cb->ksnr_peer;
-	int type;
+	int type = SOCKLND_CONN_NONE;
 	int wanted;
 	struct socket *sock;
 	time64_t deadline;
@@ -1987,13 +1987,19 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb)
 			goto failed;
 		}
 
+		if (rc == EALREADY && conn_cb->ksnr_conn_count > 0)
+			conn_cb->ksnr_busy_retry_count += 1;
+		else
+			conn_cb->ksnr_busy_retry_count = 0;
+
 		/* A +ve RC means I have to retry because I lost the connection
 		 * race or I have to renegotiate protocol version
 		 */
 		retry_later = (rc != 0);
+
 		if (retry_later)
-			CDEBUG(D_NET, "peer_ni %s: conn race, retry later.\n",
-			       libcfs_nidstr(&peer_ni->ksnp_id.nid));
+			CDEBUG(D_NET, "peer_ni %s: conn race, retry later. rc %d\n",
+			       libcfs_nidstr(&peer_ni->ksnp_id.nid), rc);
 
 		write_lock_bh(&ksocknal_data.ksnd_global_lock);
 	}
@@ -2001,6 +2007,15 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb)
 	conn_cb->ksnr_scheduled = 0;
 	conn_cb->ksnr_connecting = 0;
 
+	if (conn_cb->ksnr_busy_retry_count >= SOCKNAL_MAX_BUSY_RETRIES &&
+	    type > SOCKLND_CONN_NONE) {
+		/* After so many retries due to EALREADY assume that
+		 * the peer doesn't support as many connections as we want
+		 */
+		conn_cb->ksnr_connected |= BIT(type);
+		retry_later = false;
+	}
+
 	if (retry_later) {
 		/* re-queue for attention; this frees me up to handle
 		 * the peer_ni's incoming connection request
-- 
1.8.3.1