From 0c91d49a44e1214b5c65d4a557f6969b3d217881 Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Thu, 12 May 2022 13:16:10 -0500 Subject: [PATCH] LU-15860 socklnd: Duplicate ksock_conn_cb If two threads enter ksocknal_add_peer(), the first one to acquire the ksnd_global_lock will create a ksock_peer_ni and associate a ksock_conn_cb with it. When the second thread acquires the ksnd_global_lock it will find the existing ksock_peer_ni, but it does not check for an existing ksock_conn_cb. As a result, it overwrites the existing ksock_conn_cb (ksock_peer_ni::ksnp_conn_cb) and the ksock_conn_cb from the first thread becomes stranded. Modify ksocknal_add_peer() to check whether the peer_ni already has a ksock_conn_cb associated with it and, if so, release the newly created ksock_conn_cb instead of overwriting the existing one. Fixes: 7766f01e89 ("LU-13641 socklnd: replace route construct") HPE-bug-id: LUS-10956 Test-Parameters: trivial Signed-off-by: Chris Horn Change-Id: I6c0190a0c1d3321ddd85c763b86ad1f0d32cf2b9 Reviewed-on: https://review.whamcloud.com/47361 Tested-by: jenkins Reviewed-by: Frank Sehr Tested-by: Maloo Reviewed-by: Andriy Skulysh Reviewed-by: Serguei Smirnov Reviewed-by: Oleg Drokin --- lnet/klnds/socklnd/socklnd.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index bfe6a02..f3fff31 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -645,14 +645,17 @@ ksocknal_add_peer(struct lnet_ni *ni, struct lnet_processid *id, nidhash(&id->nid)); } - ksocknal_add_conn_cb_locked(peer_ni, conn_cb); - - /* Remember conns_per_peer setting at the time - * of connection initiation. It will define the - * max number of conns per type for this conn_cb - * while it's in use. 
- */ - conn_cb->ksnr_max_conns = ksocknal_get_conns_per_peer(peer_ni); + if (peer_ni->ksnp_conn_cb) { + ksocknal_conn_cb_decref(conn_cb); + } else { + ksocknal_add_conn_cb_locked(peer_ni, conn_cb); + /* Remember conns_per_peer setting at the time + * of connection initiation. It will define the + * max number of conns per type for this conn_cb + * while it's in use. + */ + conn_cb->ksnr_max_conns = ksocknal_get_conns_per_peer(peer_ni); + } write_unlock_bh(&ksocknal_data.ksnd_global_lock); -- 1.8.3.1