Whamcloud - gitweb
LU-17258 socklnd: ensure connection type established upon race 57/52957/3
author Chris Horn <chris.horn@hpe.com>
Thu, 2 Nov 2023 19:28:45 +0000 (12:28 -0700)
committer Oleg Drokin <green@whamcloud.com>
Wed, 8 Nov 2023 22:07:54 +0000 (22:07 +0000)
When a connection race is hit between two peers, only increment the
retry count if a connection of the specific type has already been
established; otherwise, this can lead to an unexpected value set in
ksnr_connected and some of the assertions being triggered in
ksocknal_connect():

"ASSERTION( (wanted & ((((1UL))) << (3))) != 0 ) failed"

Fixes: da893c6c97 ("LU-16191 socklnd: limit retries on conns_per_peer mismatch")
HPE-bug-id: LUS-11922
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Signed-off-by: Nikitas Angelinas <nikitas.angelinas@hpe.com>
Change-Id: I6e8abb39ad3c0bcd7fbc8f8c5478c903029df908
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/52957
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd.h
lnet/klnds/socklnd/socklnd_cb.c

index 3991dc0..e48a1d7 100644 (file)
@@ -380,33 +380,6 @@ ksocknal_get_peer_info(struct lnet_ni *ni, int index,
 }
 
 static unsigned int
-ksocknal_get_conn_count_by_type(struct ksock_conn_cb *conn_cb,
-                               int type)
-{
-       unsigned int count = 0;
-
-       switch (type) {
-       case SOCKLND_CONN_CONTROL:
-               count = conn_cb->ksnr_ctrl_conn_count;
-               break;
-       case SOCKLND_CONN_BULK_IN:
-               count = conn_cb->ksnr_blki_conn_count;
-               break;
-       case SOCKLND_CONN_BULK_OUT:
-               count = conn_cb->ksnr_blko_conn_count;
-               break;
-       case SOCKLND_CONN_ANY:
-               count = conn_cb->ksnr_conn_count;
-               break;
-       default:
-               LBUG();
-               break;
-       }
-
-       return count;
-}
-
-static unsigned int
 ksocknal_get_conns_per_peer(struct ksock_peer_ni *peer_ni)
 {
        struct lnet_ni *ni = peer_ni->ksnp_ni;
@@ -577,6 +550,33 @@ ksocknal_del_conn_cb_locked(struct ksock_conn_cb *conn_cb)
        }
 }
 
+unsigned int
+ksocknal_get_conn_count_by_type(struct ksock_conn_cb *conn_cb,
+                               int type)
+{
+       unsigned int count = 0;
+
+       switch (type) {
+       case SOCKLND_CONN_CONTROL:
+               count = conn_cb->ksnr_ctrl_conn_count;
+               break;
+       case SOCKLND_CONN_BULK_IN:
+               count = conn_cb->ksnr_blki_conn_count;
+               break;
+       case SOCKLND_CONN_BULK_OUT:
+               count = conn_cb->ksnr_blko_conn_count;
+               break;
+       case SOCKLND_CONN_ANY:
+               count = conn_cb->ksnr_conn_count;
+               break;
+       default:
+               LBUG();
+               break;
+       }
+
+       return count;
+}
+
 int
 ksocknal_add_peer(struct lnet_ni *ni, struct lnet_processid *id,
                  struct sockaddr *addr)
index f34ffec..7c6a18d 100644 (file)
@@ -607,6 +607,8 @@ int ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
                   unsigned int offset, unsigned int mlen, unsigned int rlen);
 int ksocknal_accept(struct lnet_ni *ni, struct socket *sock);
 
+unsigned int ksocknal_get_conn_count_by_type(struct ksock_conn_cb *conn_cb,
+                                            int type);
 int ksocknal_add_peer(struct lnet_ni *ni, struct lnet_processid *id,
                      struct sockaddr *addr);
 struct ksock_peer_ni *ksocknal_find_peer_locked(struct lnet_ni *ni,
index 93e97f8..9ccf557 100644 (file)
@@ -2001,7 +2001,8 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb)
                        goto failed;
                }
 
-               if (rc == EALREADY && conn_cb->ksnr_conn_count > 0)
+               if (rc == EALREADY &&
+                   ksocknal_get_conn_count_by_type(conn_cb, type) > 0)
                        conn_cb->ksnr_busy_retry_count += 1;
                else
                        conn_cb->ksnr_busy_retry_count = 0;