From 7821a3dd38ec5087d8b6e9fecb03e308e1b489a2 Mon Sep 17 00:00:00 2001 From: Serguei Smirnov Date: Mon, 26 Sep 2022 16:47:24 -0700 Subject: [PATCH] LU-16191 socklnd: limit retries on conns_per_peer mismatch If connection initiator has a higher conns-per-peer setting than its peer, don't try to create extra connections forever as the peer will keep rejecting them. A few retries should suffice to resolve a valid race. Lustre-change: https://review.whamcloud.com/48664 Lustre-commit: da893c6c9707ca3b2e7532d05f754fccf1cffc74 Test-Parameters: trivial Fixes: 71b2476e ("LU-12815 socklnd: add conns_per_peer parameter") Signed-off-by: Serguei Smirnov Change-Id: I7d04d4ac41e98a738b6c85c3d323608038f5c51e Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/49914 Reviewed-by: Frank Sehr Reviewed-by: Andreas Dilger Reviewed-by: Cyril Bordage Tested-by: jenkins Tested-by: Maloo --- lnet/klnds/socklnd/socklnd.c | 1 + lnet/klnds/socklnd/socklnd.h | 4 ++++ lnet/klnds/socklnd/socklnd_cb.c | 28 ++++++++++++++++++++++------ 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 9097245..2bca5b5 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -130,6 +130,7 @@ ksocknal_create_conn_cb(__u32 ipaddr, int port) conn_cb->ksnr_blki_conn_count = 0; conn_cb->ksnr_blko_conn_count = 0; conn_cb->ksnr_max_conns = 0; + conn_cb->ksnr_busy_retry_count = 0; return conn_cb; } diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index 6072122..9bb361c 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -365,6 +365,7 @@ struct ksock_conn { }; #define SOCKNAL_CONN_COUNT_MAX_BITS 8 /* max conn count bits */ +#define SOCKNAL_MAX_BUSY_RETRIES 3 struct ksock_conn_cb { struct list_head ksnr_connd_list;/* chain on ksnr_connd_routes */ @@ -388,6 +389,9 @@ struct ksock_conn_cb { unsigned int ksnr_max_conns; /* conns_per_peer at peer * creation */ + unsigned int ksnr_busy_retry_count;/* counts retry attempts + * due to EALREADY rc + */ }; #define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */ diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index b2c3537..002b604 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -1898,7 +1898,7 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb) { LIST_HEAD(zombies); struct ksock_peer_ni *peer_ni = conn_cb->ksnr_peer; - int type; + int type = SOCKLND_CONN_NONE; int wanted; struct socket *sock; time64_t deadline; @@ -1977,12 +1977,18 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb) goto failed; } + if (rc == EALREADY && conn_cb->ksnr_conn_count > 0) + conn_cb->ksnr_busy_retry_count += 1; + else + conn_cb->ksnr_busy_retry_count = 0; + /* A +ve RC means I have to retry because I lost the connection * race or I have to renegotiate protocol version */ retry_later = (rc != 0); + if (retry_later) - CDEBUG(D_NET, "peer_ni %s: conn race, retry later.\n", - libcfs_nid2str(peer_ni->ksnp_id.nid)); + CDEBUG(D_NET, "peer_ni %s: conn race, retry later. rc %d\n", + libcfs_nid2str(peer_ni->ksnp_id.nid), rc); write_lock_bh(&ksocknal_data.ksnd_global_lock); } @@ -1990,9 +1996,19 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb) conn_cb->ksnr_scheduled = 0; conn_cb->ksnr_connecting = 0; - if (retry_later) { - /* re-queue for attention; this frees me up to handle - * the peer_ni's incoming connection request */ + if (conn_cb->ksnr_busy_retry_count >= SOCKNAL_MAX_BUSY_RETRIES && + type > SOCKLND_CONN_NONE) { + /* After so many retries due to EALREADY assume that + * the peer doesn't support as many connections as we want + */ + conn_cb->ksnr_connected |= BIT(type); + retry_later = false; + } + + if (retry_later) { + /* re-queue for attention; this frees me up to handle + * the peer_ni's incoming connection request + */ if (rc == EALREADY || (rc == 0 && peer_ni->ksnp_accepting > 0)) { -- 1.8.3.1