From 5afe3b0538c533c3cca370bc9c0901abccca299a Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Thu, 2 Nov 2023 12:28:45 -0700 Subject: [PATCH] LU-17258 socklnd: ensure connection type established upon race When a connection race is hit between two peers, only increment the retry count if a connection of the specific type has already been established; otherwise, this can lead to an unexpected value set in ksnr_connected and some of the assertions being triggered in ksocknal_connect(): "ASSERTION( (wanted & ((((1UL))) << (3))) != 0 ) failed" Fixes: da893c6c97 ("LU-16191 socklnd: limit retries on conns_per_peer mismatch") HPE-bug-id: LUS-11922 Signed-off-by: Chris Horn Signed-off-by: Nikitas Angelinas Change-Id: I6e8abb39ad3c0bcd7fbc8f8c5478c903029df908 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/52957 Reviewed-by: James Simmons Reviewed-by: Oleg Drokin Reviewed-by: Serguei Smirnov Tested-by: Maloo Tested-by: jenkins --- lnet/klnds/socklnd/socklnd.c | 54 ++++++++++++++++++++--------------------- lnet/klnds/socklnd/socklnd.h | 2 ++ lnet/klnds/socklnd/socklnd_cb.c | 3 ++- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 3991dc03..e48a1d7 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -380,33 +380,6 @@ ksocknal_get_peer_info(struct lnet_ni *ni, int index, } static unsigned int -ksocknal_get_conn_count_by_type(struct ksock_conn_cb *conn_cb, - int type) -{ - unsigned int count = 0; - - switch (type) { - case SOCKLND_CONN_CONTROL: - count = conn_cb->ksnr_ctrl_conn_count; - break; - case SOCKLND_CONN_BULK_IN: - count = conn_cb->ksnr_blki_conn_count; - break; - case SOCKLND_CONN_BULK_OUT: - count = conn_cb->ksnr_blko_conn_count; - break; - case SOCKLND_CONN_ANY: - count = conn_cb->ksnr_conn_count; - break; - default: - LBUG(); - break; - } - - return count; -} - -static unsigned int ksocknal_get_conns_per_peer(struct ksock_peer_ni *peer_ni) { struct lnet_ni *ni = peer_ni->ksnp_ni; @@ -577,6 +550,33 @@ ksocknal_del_conn_cb_locked(struct ksock_conn_cb *conn_cb) } } +unsigned int +ksocknal_get_conn_count_by_type(struct ksock_conn_cb *conn_cb, + int type) +{ + unsigned int count = 0; + + switch (type) { + case SOCKLND_CONN_CONTROL: + count = conn_cb->ksnr_ctrl_conn_count; + break; + case SOCKLND_CONN_BULK_IN: + count = conn_cb->ksnr_blki_conn_count; + break; + case SOCKLND_CONN_BULK_OUT: + count = conn_cb->ksnr_blko_conn_count; + break; + case SOCKLND_CONN_ANY: + count = conn_cb->ksnr_conn_count; + break; + default: + LBUG(); + break; + } + + return count; +} + int ksocknal_add_peer(struct lnet_ni *ni, struct lnet_processid *id, struct sockaddr *addr) diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index f34ffec..7c6a18d 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -607,6 +607,8 @@ int ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, unsigned int offset, unsigned int mlen, unsigned int rlen); int ksocknal_accept(struct lnet_ni *ni, struct socket *sock); +unsigned int ksocknal_get_conn_count_by_type(struct ksock_conn_cb *conn_cb, + int type); int ksocknal_add_peer(struct lnet_ni *ni, struct lnet_processid *id, struct sockaddr *addr); struct ksock_peer_ni *ksocknal_find_peer_locked(struct lnet_ni *ni, diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 93e97f8..9ccf557 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -2001,7 +2001,8 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb) goto failed; } - if (rc == EALREADY && conn_cb->ksnr_conn_count > 0) + if (rc == EALREADY && + ksocknal_get_conn_count_by_type(conn_cb, type) > 0) conn_cb->ksnr_busy_retry_count += 1; else conn_cb->ksnr_busy_retry_count = 0; -- 1.8.3.1