Whamcloud - gitweb
LU-16536 osp: don't cleanup ldlm in precleanup phase
authorAlex Zhuravlev <bzzz@whamcloud.com>
Tue, 7 Feb 2023 09:35:33 +0000 (12:35 +0300)
committerAndreas Dilger <adilger@whamcloud.com>
Tue, 14 Feb 2023 19:06:27 +0000 (19:06 +0000)
Instead, do this in the cleanup phase so that all OSPs have a chance
to abort in-flight RPCs, which can block MDT threads holding
LDLM locks.

Lustre-change: https://review.whamcloud.com/49925
Lustre-commit: eed4d4c7523c26cfc5bc230986d96b2acf152dee

Fixes: 226fd401f9 ("LU-7660 dne: support fs default stripe")
Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: Ib3714b29c514a7fa938d47717dc36525654407d6
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/49926
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/include/obd_support.h
lustre/osp/osp_dev.c
lustre/target/out_handler.c
lustre/tests/replay-single.sh

index 6699559..a956266 100644 (file)
@@ -682,6 +682,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OUT_UPDATE_DROP        0x1707
 #define OBD_FAIL_OUT_OBJECT_MISS       0x1708
 #define OBD_FAIL_OUT_EIO               0x1709
+#define OBD_FAIL_OUT_DROP_DESTROY      0x170b
 
 /* MIGRATE */
 #define OBD_FAIL_MIGRATE_ENTRIES               0x1801
index 00d5989..f93099e 100644 (file)
@@ -663,10 +663,15 @@ static int osp_process_config(const struct lu_env *env,
        case LCFG_PRE_CLEANUP:
                rc = osp_disconnect(d);
                osp_update_fini(env, d);
-               if (obd->obd_namespace != NULL)
-                       ldlm_namespace_free_prior(obd->obd_namespace, NULL, 1);
                break;
        case LCFG_CLEANUP:
+               /*
+                * cleanup ldlm here so that the PRE_CLEANUP phase doesn't
+                * block waiting for locks held by MDT threads which are
+                * waiting for all OSPs to interrupt their in-flight RPCs
+                */
+               if (obd->obd_namespace != NULL)
+                       ldlm_namespace_free_prior(obd->obd_namespace, NULL, 1);
                lu_dev_del_linkage(dev->ld_site, dev);
                rc = osp_shutdown(env, d);
                break;
index 702400e..a1470ea 100644 (file)
@@ -616,6 +616,9 @@ static int out_destroy(struct tgt_session_info *tsi)
                            tti->tti_u.update.tti_update_reply,
                            tti->tti_u.update.tti_update_reply_index);
 
+       if (OBD_FAIL_CHECK(OBD_FAIL_OUT_DROP_DESTROY))
+               tsi->tsi_pill->rc_req->rq_no_reply = 1;
+
        RETURN(rc);
 }
 
index 77a8a3e..3a50ff1 100755 (executable)
@@ -5022,6 +5022,29 @@ test_135() {
 }
 run_test 135 "Server failure in lock replay phase"
 
+test_136() {
+       (( $MDSCOUNT >= 3 )) || skip "needs > 2 MDTs"
+       (( MDS1_VERSION >= $(version_code 2.14.0.76) )) ||
+               skip "need MDS version >= 2.14.0.76 for LU-16536 fix"
+
+       $LFS mkdir -i0 -c3 $DIR/$tdir || error "can't mkdir"
+       $LFS getdirstripe $DIR/$tdir
+       sync;sync;sync
+
+#define OBD_FAIL_OUT_DROP_DESTROY      0x170b
+       local list=$(comma_list $(mdts_nodes))
+       do_nodes $list $LCTL set_param fail_loc=0x1710
+       rmdir $DIR/$tdir &
+       sleep 0.5
+       stop mds2
+       stop mds3
+       stop mds1
+       start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS || error "MDT1 start failed"
+       start mds2 $(mdsdevname 2) $MDS_MOUNT_OPTS || error "MDT2 start failed"
+       start mds3 $(mdsdevname 3) $MDS_MOUNT_OPTS || error "MDT3 star"
+}
+run_test 136 "MDS to disconnect all OSPs first, then cleanup ldlm"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status