Whamcloud - gitweb
LU-14810 lnet: ongoing push when discovery is stopped 84/54884/3
author Cyril Bordage <cbordage@whamcloud.com>
Wed, 24 Apr 2024 02:21:53 +0000 (04:21 +0200)
committer Oleg Drokin <green@whamcloud.com>
Tue, 21 May 2024 18:19:48 +0000 (18:19 +0000)
If a push has not completed when the discovery thread is stopped, then
ln_dc_handler is still in use as an MD handler (set by
lnet_peer_send_push). That leads to an assertion failure in
lnet_assert_handler_unused.

To fix that, we call lnet_assert_handler_unused for ln_dc_handler only
after the monitor thread has been stopped. Thus, the patch for LU-17496
is no longer needed.

Fixes: 36b14a23a6 ("LU-17207 lnet: race b/w monitor thr stop and discovery push")
Test-Parameters: testlist=sanity-lnet env=ONLY="212 220",ONLY_REPEAT=100
Signed-off-by: Cyril Bordage <cbordage@whamcloud.com>
Change-Id: I426c37b12a3d29327a7295f528a5b875a9ac88a0
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/54884
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Shaun Tancheff <shaun.tancheff@hpe.com>
Reviewed-by: Frank Sehr <fsehr@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/api.h
lnet/lnet/api-ni.c
lnet/lnet/lib-md.c
lnet/lnet/peer.c
lnet/selftest/rpc.c
lustre/ptlrpc/events.c

index c5db7f5..eacf730 100644 (file)
@@ -107,7 +107,7 @@ int LNetMDBind(const struct lnet_md *md_in,
 int __LNetMDUnlink(struct lnet_handle_md md_in, bool discard);
 #define LNetMDUnlink(handle) __LNetMDUnlink(handle, false)
 
-bool lnet_assert_handler_unused(lnet_handler_t handler, bool assert);
+void lnet_assert_handler_unused(lnet_handler_t handler);
 /** @} lnet_md */
 
 /** \defgroup lnet_data Data movement operations
index 8c12a8f..e2bb7c7 100644 (file)
@@ -1437,7 +1437,7 @@ lnet_unprepare(void)
                the_lnet.ln_mt_zombie_rstqs = NULL;
        }
 
-       lnet_assert_handler_unused(the_lnet.ln_mt_handler, true);
+       lnet_assert_handler_unused(the_lnet.ln_mt_handler);
        the_lnet.ln_mt_handler = NULL;
 
        lnet_portals_destroy();
@@ -2132,7 +2132,7 @@ lnet_ping_target_fini(void)
        lnet_ping_md_unlink(the_lnet.ln_ping_target,
                            &the_lnet.ln_ping_target_md);
 
-       lnet_assert_handler_unused(the_lnet.ln_ping_target_handler, true);
+       lnet_assert_handler_unused(the_lnet.ln_ping_target_handler);
        lnet_ping_target_destroy();
 }
 
@@ -2304,7 +2304,7 @@ static void lnet_push_target_fini(void)
        the_lnet.ln_push_target_nbytes = 0;
 
        LNetClearLazyPortal(LNET_RESERVED_PORTAL);
-       lnet_assert_handler_unused(the_lnet.ln_push_target_handler, true);
+       lnet_assert_handler_unused(the_lnet.ln_push_target_handler);
        the_lnet.ln_push_target_handler = NULL;
 }
 
@@ -3154,6 +3154,7 @@ LNetNIFini(void)
        if (the_lnet.ln_refcount != 1) {
                the_lnet.ln_refcount--;
        } else {
+               lnet_handler_t dc_handler = the_lnet.ln_dc_handler;
                LASSERT(!the_lnet.ln_niinit_self);
 
                lnet_net_lock(LNET_LOCK_EX);
@@ -3165,6 +3166,7 @@ LNetNIFini(void)
                lnet_router_debugfs_fini();
                lnet_peer_discovery_stop();
                lnet_monitor_thr_stop();
+               lnet_assert_handler_unused(dc_handler);
                lnet_push_target_fini();
                lnet_ping_target_fini();
 
index c834d48..d153003 100644 (file)
@@ -244,29 +244,22 @@ lnet_md_link(struct lnet_libmd *md, lnet_handler_t handler, int cpt)
        list_add(&md->md_list, &container->rec_active);
 }
 
-bool lnet_assert_handler_unused(lnet_handler_t handler, bool assert)
+void lnet_assert_handler_unused(lnet_handler_t handler)
 {
        struct lnet_res_container *container;
        int cpt;
-       bool handler_in_use = false;
 
        if (!handler)
-               return handler_in_use;
+               return;
        cfs_percpt_for_each(container, cpt, the_lnet.ln_md_containers) {
                struct lnet_libmd *md;
 
                lnet_res_lock(cpt);
                list_for_each_entry(md, &container->rec_active, md_list) {
-                       if (assert) {
-                               LASSERT(md->md_handler != handler);
-                       } else if (md->md_handler == handler) {
-                               handler_in_use = true;
-                               break;
-                       }
+                       LASSERT(md->md_handler != handler);
                }
                lnet_res_unlock(cpt);
        }
-       return handler_in_use;
 }
 EXPORT_SYMBOL(lnet_assert_handler_unused);
 
index f444eb3..1e6d0f1 100644 (file)
@@ -4044,7 +4044,6 @@ static void lnet_resend_msgs(void)
 static int lnet_peer_discovery(void *arg)
 {
        struct lnet_peer *lp;
-       int retry = 3;
        int rc;
 
        wait_for_completion(&the_lnet.ln_started);
@@ -4155,7 +4154,6 @@ static int lnet_peer_discovery(void *arg)
                lnet_net_unlock(LNET_LOCK_EX);
        }
 
-cleanup:
        CDEBUG(D_NET, "stopping\n");
        /*
         * Clean up before telling lnet_peer_discovery_stop() that
@@ -4197,9 +4195,6 @@ cleanup:
        }
        lnet_net_unlock(LNET_LOCK_EX);
 
-       if (lnet_assert_handler_unused(the_lnet.ln_dc_handler, --retry <= 0))
-               goto cleanup;
-
        the_lnet.ln_dc_handler = NULL;
 
        the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN;
index 93ad1fa..153f0e8 100644 (file)
@@ -1693,7 +1693,7 @@ srpc_shutdown (void)
                rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL);
                rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL);
                LASSERT(rc == 0);
-               lnet_assert_handler_unused(srpc_data.rpc_lnet_handler, true);
+               lnet_assert_handler_unused(srpc_data.rpc_lnet_handler);
                fallthrough;
 
        case SRPC_STATE_NI_INIT:
index 586fd3f..789bc75 100644 (file)
@@ -591,7 +591,7 @@ static void ptlrpc_ni_fini(void)
        percpu_ref_kill(&ptlrpc_pending);
        wait_for_completion(&ptlrpc_done);
 
-       lnet_assert_handler_unused(ptlrpc_handler, true);
+       lnet_assert_handler_unused(ptlrpc_handler);
        LNetNIFini();
 }