Whamcloud - gitweb
LU-16191 socklnd: limit retries on conns_per_peer mismatch 64/48664/3
author Serguei Smirnov <ssmirnov@whamcloud.com>
Mon, 26 Sep 2022 23:47:24 +0000 (16:47 -0700)
committer Oleg Drokin <green@whamcloud.com>
Mon, 10 Oct 2022 05:38:58 +0000 (05:38 +0000)
If the connection initiator has a higher conns_per_peer setting than
its peer, don't keep trying to create extra connections forever, as
the peer will keep rejecting them. A few retries should suffice to
resolve a valid race.

Test-Parameters: trivial
Fixes: 71b2476e ("LU-12815 socklnd: add conns_per_peer parameter")
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Change-Id: I7d04d4ac41e98a738b6c85c3d323608038f5c51e
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48664
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd.h
lnet/klnds/socklnd/socklnd_cb.c

index f650a45..7d991a1 100644 (file)
@@ -144,6 +144,7 @@ ksocknal_create_conn_cb(struct sockaddr *addr)
        conn_cb->ksnr_blki_conn_count = 0;
        conn_cb->ksnr_blko_conn_count = 0;
        conn_cb->ksnr_max_conns = 0;
+       conn_cb->ksnr_busy_retry_count = 0;
 
        return conn_cb;
 }
index c92ae3c..c9a9fdb 100644 (file)
@@ -365,6 +365,7 @@ struct ksock_conn {
 };
 
 #define SOCKNAL_CONN_COUNT_MAX_BITS    8       /* max conn count bits */
+#define SOCKNAL_MAX_BUSY_RETRIES       3
 
 struct ksock_conn_cb {
        struct list_head        ksnr_connd_list;/* chain on ksnr_connd_routes */
@@ -387,6 +388,9 @@ struct ksock_conn_cb {
        unsigned int            ksnr_max_conns; /* conns_per_peer at peer
                                                 * creation
                                                 */
+       unsigned int            ksnr_busy_retry_count;/* counts retry attempts
+                                                      * due to EALREADY rc
+                                                      */
 };
 
 #define SOCKNAL_KEEPALIVE_PING          1       /* cookie for keepalive ping */
index a3b20d1..1ebf327 100644 (file)
@@ -1907,7 +1907,7 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb)
 {
        LIST_HEAD(zombies);
        struct ksock_peer_ni *peer_ni = conn_cb->ksnr_peer;
-       int type;
+       int type = SOCKLND_CONN_NONE;
        int wanted;
        struct socket *sock;
        time64_t deadline;
@@ -1987,13 +1987,19 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb)
                        goto failed;
                }
 
+               if (rc == EALREADY && conn_cb->ksnr_conn_count > 0)
+                       conn_cb->ksnr_busy_retry_count += 1;
+               else
+                       conn_cb->ksnr_busy_retry_count = 0;
+
                /* A +ve RC means I have to retry because I lost the connection
                 * race or I have to renegotiate protocol version
                 */
                retry_later = (rc != 0);
+
                if (retry_later)
-                       CDEBUG(D_NET, "peer_ni %s: conn race, retry later.\n",
-                              libcfs_nidstr(&peer_ni->ksnp_id.nid));
+                       CDEBUG(D_NET, "peer_ni %s: conn race, retry later. rc %d\n",
+                              libcfs_nidstr(&peer_ni->ksnp_id.nid), rc);
 
                write_lock_bh(&ksocknal_data.ksnd_global_lock);
        }
@@ -2001,6 +2007,15 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb)
        conn_cb->ksnr_scheduled = 0;
        conn_cb->ksnr_connecting = 0;
 
+       if (conn_cb->ksnr_busy_retry_count >= SOCKNAL_MAX_BUSY_RETRIES &&
+           type > SOCKLND_CONN_NONE) {
+               /* After so many retries due to EALREADY assume that
+                * the peer doesn't support as many connections as we want
+                */
+               conn_cb->ksnr_connected |= BIT(type);
+               retry_later = false;
+       }
+
        if (retry_later) {
                /* re-queue for attention; this frees me up to handle
                 * the peer_ni's incoming connection request