Whamcloud - gitweb
LU-16191 socklnd: limit retries on conns_per_peer mismatch
author: Serguei Smirnov <ssmirnov@whamcloud.com>
Mon, 26 Sep 2022 23:47:24 +0000 (16:47 -0700)
committer: Andreas Dilger <adilger@whamcloud.com>
Wed, 8 Feb 2023 05:48:37 +0000 (05:48 +0000)
If the connection initiator has a higher conns_per_peer setting than
its peer, don't keep trying to create extra connections forever, as
the peer will keep rejecting them. A few retries (limited by
SOCKNAL_MAX_BUSY_RETRIES, set to 3) should suffice to resolve a
genuine connection race.

Lustre-change: https://review.whamcloud.com/48664
Lustre-commit: da893c6c9707ca3b2e7532d05f754fccf1cffc74

Test-Parameters: trivial
Fixes: 71b2476e ("LU-12815 socklnd: add conns_per_peer parameter")
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Change-Id: I7d04d4ac41e98a738b6c85c3d323608038f5c51e
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/49914
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Cyril Bordage <cbordage@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd.h
lnet/klnds/socklnd/socklnd_cb.c

index 9097245..2bca5b5 100644 (file)
@@ -130,6 +130,7 @@ ksocknal_create_conn_cb(__u32 ipaddr, int port)
        conn_cb->ksnr_blki_conn_count = 0;
        conn_cb->ksnr_blko_conn_count = 0;
        conn_cb->ksnr_max_conns = 0;
+       conn_cb->ksnr_busy_retry_count = 0;
 
        return conn_cb;
 }
index 6072122..9bb361c 100644 (file)
@@ -365,6 +365,7 @@ struct ksock_conn {
 };
 
 #define SOCKNAL_CONN_COUNT_MAX_BITS    8       /* max conn count bits */
+#define SOCKNAL_MAX_BUSY_RETRIES       3
 
 struct ksock_conn_cb {
        struct list_head        ksnr_connd_list;/* chain on ksnr_connd_routes */
@@ -388,6 +389,9 @@ struct ksock_conn_cb {
        unsigned int            ksnr_max_conns; /* conns_per_peer at peer
                                                 * creation
                                                 */
+       unsigned int            ksnr_busy_retry_count;/* counts retry attempts
+                                                      * due to EALREADY rc
+                                                      */
 };
 
 #define SOCKNAL_KEEPALIVE_PING          1       /* cookie for keepalive ping */
index b2c3537..002b604 100644 (file)
@@ -1898,7 +1898,7 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb)
 {
        LIST_HEAD(zombies);
        struct ksock_peer_ni *peer_ni = conn_cb->ksnr_peer;
-       int type;
+       int type = SOCKLND_CONN_NONE;
        int wanted;
        struct socket *sock;
        time64_t deadline;
@@ -1977,12 +1977,18 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb)
                        goto failed;
                }
 
+               if (rc == EALREADY && conn_cb->ksnr_conn_count > 0)
+                       conn_cb->ksnr_busy_retry_count += 1;
+               else
+                       conn_cb->ksnr_busy_retry_count = 0;
+
                /* A +ve RC means I have to retry because I lost the connection
                 * race or I have to renegotiate protocol version */
                retry_later = (rc != 0);
+
                if (retry_later)
-                       CDEBUG(D_NET, "peer_ni %s: conn race, retry later.\n",
-                              libcfs_nid2str(peer_ni->ksnp_id.nid));
+                       CDEBUG(D_NET, "peer_ni %s: conn race, retry later. rc %d\n",
+                              libcfs_nid2str(peer_ni->ksnp_id.nid), rc);
 
                write_lock_bh(&ksocknal_data.ksnd_global_lock);
        }
@@ -1990,9 +1996,19 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb)
        conn_cb->ksnr_scheduled = 0;
        conn_cb->ksnr_connecting = 0;
 
-        if (retry_later) {
-                /* re-queue for attention; this frees me up to handle
-                 * the peer_ni's incoming connection request */
+       if (conn_cb->ksnr_busy_retry_count >= SOCKNAL_MAX_BUSY_RETRIES &&
+           type > SOCKLND_CONN_NONE) {
+               /* After so many retries due to EALREADY assume that
+                * the peer doesn't support as many connections as we want
+                */
+               conn_cb->ksnr_connected |= BIT(type);
+               retry_later = false;
+       }
+
+       if (retry_later) {
+               /* re-queue for attention; this frees me up to handle
+                * the peer_ni's incoming connection request
+                */
 
                if (rc == EALREADY ||
                    (rc == 0 && peer_ni->ksnp_accepting > 0)) {