From 3ba393a5cb21ff0f8bd8a09c341ee01e936321c7 Mon Sep 17 00:00:00 2001 From: Cyril Bordage Date: Wed, 24 Apr 2024 04:21:53 +0200 Subject: [PATCH] LU-14810 lnet: ongoing push when discovery is stopped If a push has not completed when the discovery thread is stopped, then ln_dc_handler is still in use as an MD handler (from lnet_peer_send_push). That leads to an assertion failure in lnet_assert_handler_unused. To fix that, we call lnet_assert_handler_unused only after the monitor thread has been stopped. Thus, the patch for LU-17496 is no longer needed. Fixes: 36b14a23a6 ("LU-17207 lnet: race b/w monitor thr stop and discovery push") Test-Parameters: testlist=sanity-lnet env=ONLY="212 220",ONLY_REPEAT=100 Signed-off-by: Cyril Bordage Change-Id: I426c37b12a3d29327a7295f528a5b875a9ac88a0 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/54884 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Shaun Tancheff Reviewed-by: Frank Sehr Reviewed-by: Serguei Smirnov Reviewed-by: Oleg Drokin --- lnet/include/lnet/api.h | 2 +- lnet/lnet/api-ni.c | 8 +++++--- lnet/lnet/lib-md.c | 13 +++---------- lnet/lnet/peer.c | 5 ----- lnet/selftest/rpc.c | 2 +- lustre/ptlrpc/events.c | 2 +- 6 files changed, 11 insertions(+), 21 deletions(-) diff --git a/lnet/include/lnet/api.h b/lnet/include/lnet/api.h index c5db7f5..eacf730 100644 --- a/lnet/include/lnet/api.h +++ b/lnet/include/lnet/api.h @@ -107,7 +107,7 @@ int LNetMDBind(const struct lnet_md *md_in, int __LNetMDUnlink(struct lnet_handle_md md_in, bool discard); #define LNetMDUnlink(handle) __LNetMDUnlink(handle, false) -bool lnet_assert_handler_unused(lnet_handler_t handler, bool assert); +void lnet_assert_handler_unused(lnet_handler_t handler); /** @} lnet_md */ /** \defgroup lnet_data Data movement operations diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 8c12a8f..e2bb7c7 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -1437,7 +1437,7 @@ lnet_unprepare(void) the_lnet.ln_mt_zombie_rstqs = NULL; } - 
lnet_assert_handler_unused(the_lnet.ln_mt_handler, true); + lnet_assert_handler_unused(the_lnet.ln_mt_handler); the_lnet.ln_mt_handler = NULL; lnet_portals_destroy(); @@ -2132,7 +2132,7 @@ lnet_ping_target_fini(void) lnet_ping_md_unlink(the_lnet.ln_ping_target, &the_lnet.ln_ping_target_md); - lnet_assert_handler_unused(the_lnet.ln_ping_target_handler, true); + lnet_assert_handler_unused(the_lnet.ln_ping_target_handler); lnet_ping_target_destroy(); } @@ -2304,7 +2304,7 @@ static void lnet_push_target_fini(void) the_lnet.ln_push_target_nbytes = 0; LNetClearLazyPortal(LNET_RESERVED_PORTAL); - lnet_assert_handler_unused(the_lnet.ln_push_target_handler, true); + lnet_assert_handler_unused(the_lnet.ln_push_target_handler); the_lnet.ln_push_target_handler = NULL; } @@ -3154,6 +3154,7 @@ LNetNIFini(void) if (the_lnet.ln_refcount != 1) { the_lnet.ln_refcount--; } else { + lnet_handler_t dc_handler = the_lnet.ln_dc_handler; LASSERT(!the_lnet.ln_niinit_self); lnet_net_lock(LNET_LOCK_EX); @@ -3165,6 +3166,7 @@ LNetNIFini(void) lnet_router_debugfs_fini(); lnet_peer_discovery_stop(); lnet_monitor_thr_stop(); + lnet_assert_handler_unused(dc_handler); lnet_push_target_fini(); lnet_ping_target_fini(); diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c index c834d48..d153003 100644 --- a/lnet/lnet/lib-md.c +++ b/lnet/lnet/lib-md.c @@ -244,29 +244,22 @@ lnet_md_link(struct lnet_libmd *md, lnet_handler_t handler, int cpt) list_add(&md->md_list, &container->rec_active); } -bool lnet_assert_handler_unused(lnet_handler_t handler, bool assert) +void lnet_assert_handler_unused(lnet_handler_t handler) { struct lnet_res_container *container; int cpt; - bool handler_in_use = false; if (!handler) - return handler_in_use; + return; cfs_percpt_for_each(container, cpt, the_lnet.ln_md_containers) { struct lnet_libmd *md; lnet_res_lock(cpt); list_for_each_entry(md, &container->rec_active, md_list) { - if (assert) { - LASSERT(md->md_handler != handler); - } else if (md->md_handler == handler) { - 
handler_in_use = true; - break; - } + LASSERT(md->md_handler != handler); } lnet_res_unlock(cpt); } - return handler_in_use; } EXPORT_SYMBOL(lnet_assert_handler_unused); diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index f444eb3..1e6d0f1 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -4044,7 +4044,6 @@ static void lnet_resend_msgs(void) static int lnet_peer_discovery(void *arg) { struct lnet_peer *lp; - int retry = 3; int rc; wait_for_completion(&the_lnet.ln_started); @@ -4155,7 +4154,6 @@ static int lnet_peer_discovery(void *arg) lnet_net_unlock(LNET_LOCK_EX); } -cleanup: CDEBUG(D_NET, "stopping\n"); /* * Clean up before telling lnet_peer_discovery_stop() that @@ -4197,9 +4195,6 @@ cleanup: } lnet_net_unlock(LNET_LOCK_EX); - if (lnet_assert_handler_unused(the_lnet.ln_dc_handler, --retry <= 0)) - goto cleanup; - the_lnet.ln_dc_handler = NULL; the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; diff --git a/lnet/selftest/rpc.c b/lnet/selftest/rpc.c index 93ad1fa..153f0e8 100644 --- a/lnet/selftest/rpc.c +++ b/lnet/selftest/rpc.c @@ -1693,7 +1693,7 @@ srpc_shutdown (void) rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL); LASSERT(rc == 0); - lnet_assert_handler_unused(srpc_data.rpc_lnet_handler, true); + lnet_assert_handler_unused(srpc_data.rpc_lnet_handler); fallthrough; case SRPC_STATE_NI_INIT: diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 586fd3f..789bc75 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -591,7 +591,7 @@ static void ptlrpc_ni_fini(void) percpu_ref_kill(&ptlrpc_pending); wait_for_completion(&ptlrpc_done); - lnet_assert_handler_unused(ptlrpc_handler, true); + lnet_assert_handler_unused(ptlrpc_handler); LNetNIFini(); } -- 1.8.3.1