From 8371f6a75188488e6c9f4fc4d478d5fe9dfe0d1e Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Thu, 2 Nov 2023 12:28:45 -0700 Subject: [PATCH] LU-17258 socklnd: ensure connection type established upon race When a connection race is hit between two peers, only increment the retry count if a connection of the specific type has already been established; otherwise, this can lead to an unexpected value set in ksnr_connected and some of the assertions being triggered in ksocknal_connect(): "ASSERTION( (wanted & ((((1UL))) << (3))) != 0 ) failed" Lustre-change: https://review.whamcloud.com/52957 Lustre-commit: 5afe3b0538c533c3cca370bc9c0901abccca299a Fixes: da893c6c97 ("LU-16191 socklnd: limit retries on conns_per_peer mismatch") HPE-bug-id: LUS-11922 Signed-off-by: Chris Horn Signed-off-by: Nikitas Angelinas Change-Id: I6e8abb39ad3c0bcd7fbc8f8c5478c903029df908 Reviewed-by: James Simmons Reviewed-by: Serguei Smirnov Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/53046 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger --- lnet/klnds/socklnd/socklnd.c | 54 ++++++++++++++++++++--------------------- lnet/klnds/socklnd/socklnd.h | 2 ++ lnet/klnds/socklnd/socklnd_cb.c | 3 ++- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 63ef433c..9001d05 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -344,33 +344,6 @@ ksocknal_get_peer_info(struct lnet_ni *ni, int index, } static unsigned int -ksocknal_get_conn_count_by_type(struct ksock_conn_cb *conn_cb, - int type) -{ - unsigned int count = 0; - - switch (type) { - case SOCKLND_CONN_CONTROL: - count = conn_cb->ksnr_ctrl_conn_count; - break; - case SOCKLND_CONN_BULK_IN: - count = conn_cb->ksnr_blki_conn_count; - break; - case SOCKLND_CONN_BULK_OUT: - count = conn_cb->ksnr_blko_conn_count; - break; - case SOCKLND_CONN_ANY: - count = conn_cb->ksnr_conn_count; - break; - default: - LBUG(); - break; - } - - return count; -} - -static unsigned int ksocknal_get_conns_per_peer(struct ksock_peer_ni *peer_ni) { struct lnet_ni *ni = peer_ni->ksnp_ni; @@ -543,6 +516,33 @@ ksocknal_del_conn_cb_locked(struct ksock_conn_cb *conn_cb) } } +unsigned int +ksocknal_get_conn_count_by_type(struct ksock_conn_cb *conn_cb, + int type) +{ + unsigned int count = 0; + + switch (type) { + case SOCKLND_CONN_CONTROL: + count = conn_cb->ksnr_ctrl_conn_count; + break; + case SOCKLND_CONN_BULK_IN: + count = conn_cb->ksnr_blki_conn_count; + break; + case SOCKLND_CONN_BULK_OUT: + count = conn_cb->ksnr_blko_conn_count; + break; + case SOCKLND_CONN_ANY: + count = conn_cb->ksnr_conn_count; + break; + default: + LBUG(); + break; + } + + return count; +} + int ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, int port) diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index cf209fd..296881a 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -600,6 +600,8 @@ int ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, unsigned int offset, unsigned int mlen, unsigned int rlen); int ksocknal_accept(struct lnet_ni *ni, struct socket *sock); +unsigned int ksocknal_get_conn_count_by_type(struct ksock_conn_cb *conn_cb, + int type); int ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip, int port); struct ksock_peer_ni *ksocknal_find_peer_locked(struct lnet_ni *ni, diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index c50f25b..7943f49 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -1998,7 +1998,8 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb) goto failed; } - if (rc == EALREADY && conn_cb->ksnr_conn_count > 0) + if (rc == EALREADY && + ksocknal_get_conn_count_by_type(conn_cb, type) > 0) conn_cb->ksnr_busy_retry_count += 1; else conn_cb->ksnr_busy_retry_count = 0; -- 1.8.3.1