Whamcloud - gitweb
LU-17258 socklnd: stop connecting on too many retries 55/53955/3
author: Serguei Smirnov <ssmirnov@whamcloud.com>
Wed, 7 Feb 2024 18:48:08 +0000 (10:48 -0800)
committer: Oleg Drokin <green@whamcloud.com>
Fri, 23 Feb 2024 07:17:06 +0000 (07:17 +0000)
If a peer repeatedly rejects connection requests with EALREADY,
assume that it doesn't support as many connections as we're trying
to create. Stop connecting to the peer altogether and either
continue with the already-created connections, if there is at least
one of each type, or fail the connection attempt.

This helps avoid the assertion:

"ASSERTION( (wanted & ((((1UL))) << (3))) != 0 ) failed"

Test-Parameters: trivial testlist=sanity-lnet
Fixes: 5afe3b053 ("LU-17258 socklnd: ensure connection type established upon race")
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Change-Id: I6072e91cc36544fc2f56c91cd78f6637cf82ecbc
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53955
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Cyril Bordage <cbordage@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd_cb.c

index 10cf56c..9d7ad25 100644 (file)
@@ -450,16 +450,16 @@ ksocknal_decr_conn_count(struct ksock_conn_cb *conn_cb,
                break;
        case SOCKLND_CONN_BULK_IN:
                conn_cb->ksnr_blki_conn_count--;
-               if (conn_cb->ksnr_blki_conn_count < conn_cb->ksnr_max_conns)
+               if (conn_cb->ksnr_blki_conn_count == 0)
                        conn_cb->ksnr_connected &= ~BIT(type);
                break;
        case SOCKLND_CONN_BULK_OUT:
                conn_cb->ksnr_blko_conn_count--;
-               if (conn_cb->ksnr_blko_conn_count < conn_cb->ksnr_max_conns)
+               if (conn_cb->ksnr_blko_conn_count == 0)
                        conn_cb->ksnr_connected &= ~BIT(type);
                break;
        case SOCKLND_CONN_ANY:
-               if (conn_cb->ksnr_conn_count < conn_cb->ksnr_max_conns)
+               if (conn_cb->ksnr_conn_count == 0)
                        conn_cb->ksnr_connected &= ~BIT(type);
                break;
        default:
index 30e4771..1e47816 100644 (file)
@@ -2043,7 +2043,20 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb)
                /* After so many retries due to EALREADY assume that
                 * the peer doesn't support as many connections as we want
                 */
-               conn_cb->ksnr_connected |= BIT(type);
+               if (conn_cb->ksnr_blki_conn_count &&
+                   conn_cb->ksnr_blko_conn_count &&
+                   conn_cb->ksnr_ctrl_conn_count) {
+                       /* Don't create any more connections of any type */
+                       conn_cb->ksnr_connected |= (BIT(SOCKLND_CONN_CONTROL) |
+                                                   BIT(SOCKLND_CONN_BULK_IN) |
+                                                   BIT(SOCKLND_CONN_BULK_OUT));
+               } else {
+                       /* If don't have at least one connection of each
+                        * type, fail
+                        */
+                       write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+                       goto failed;
+               }
                retry_later = false;
        }