From e5e08e1c8e7bab742c6b92ef4c42f4003961531f Mon Sep 17 00:00:00 2001
From: Alex Zhuravlev
Date: Tue, 7 Feb 2023 12:35:33 +0300
Subject: [PATCH] LU-16536 osp: don't cleanup ldlm in precleanup phase

instead do this in cleanup phase so that all OSPs have
chance to abort in-flight RPCs which can block MDT thread
holding LDLM locks.

Lustre-change: https://review.whamcloud.com/49925
Lustre-commit: eed4d4c7523c26cfc5bc230986d96b2acf152dee

Fixes: 226fd401f9 ("LU-7660 dne: support fs default stripe")
Signed-off-by: Alex Zhuravlev
Change-Id: Ib3714b29c514a7fa938d47717dc36525654407d6
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/49926
Reviewed-by: Andreas Dilger
Tested-by: jenkins
Tested-by: Maloo
---
 lustre/include/obd_support.h  |  1 +
 lustre/osp/osp_dev.c          |  9 +++++++--
 lustre/target/out_handler.c   |  3 +++
 lustre/tests/replay-single.sh | 23 +++++++++++++++++++++++
 4 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index 6699559..a956266 100644
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -682,6 +682,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OUT_UPDATE_DROP	0x1707
 #define OBD_FAIL_OUT_OBJECT_MISS	0x1708
 #define OBD_FAIL_OUT_EIO	0x1709
+#define OBD_FAIL_OUT_DROP_DESTROY	0x170b
 
 /* MIGRATE */
 #define OBD_FAIL_MIGRATE_ENTRIES	0x1801
diff --git a/lustre/osp/osp_dev.c b/lustre/osp/osp_dev.c
index 00d5989..f93099e 100644
--- a/lustre/osp/osp_dev.c
+++ b/lustre/osp/osp_dev.c
@@ -663,10 +663,15 @@ static int osp_process_config(const struct lu_env *env,
 	case LCFG_PRE_CLEANUP:
 		rc = osp_disconnect(d);
 		osp_update_fini(env, d);
-		if (obd->obd_namespace != NULL)
-			ldlm_namespace_free_prior(obd->obd_namespace, NULL, 1);
 		break;
 	case LCFG_CLEANUP:
+		/*
+		 * cleanup ldlm so that PRE_CLEANUP phase doesn't block
+		 * awaiting for locks held by MDT threads awaiting for
+		 * all OSPs to interrupt their in-flight RPCs
+		 */
+		if (obd->obd_namespace != NULL)
+			ldlm_namespace_free_prior(obd->obd_namespace, NULL, 1);
 		lu_dev_del_linkage(dev->ld_site, dev);
 		rc = osp_shutdown(env, d);
 		break;
diff --git a/lustre/target/out_handler.c b/lustre/target/out_handler.c
index 702400e..a1470ea 100644
--- a/lustre/target/out_handler.c
+++ b/lustre/target/out_handler.c
@@ -616,6 +616,9 @@ static int out_destroy(struct tgt_session_info *tsi)
 			       tti->tti_u.update.tti_update_reply,
 			       tti->tti_u.update.tti_update_reply_index);
 
+	if (OBD_FAIL_CHECK(OBD_FAIL_OUT_DROP_DESTROY))
+		tsi->tsi_pill->rc_req->rq_no_reply = 1;
+
 	RETURN(rc);
 }
 
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh
index 77a8a3e..3a50ff1 100755
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -5022,6 +5022,29 @@ test_135() {
 }
 run_test 135 "Server failure in lock replay phase"
 
+test_136() {
+	(( $MDSCOUNT >= 3 )) || skip "needs > 2 MDTs"
+	(( MDS1_VERSION >= $(version_code 2.14.0.76) )) ||
+		skip "need MDS version >= 2.14.0.76 for LU-16536 fix"
+
+	$LFS mkdir -i0 -c3 $DIR/$tdir || error "can't mkdir"
+	$LFS getdirstripe $DIR/$tdir
+	sync;sync;sync
+
+#define OBD_FAIL_OUT_DROP_DESTROY	0x170b
+	local list=$(comma_list $(mdts_nodes))
+	do_nodes $list $LCTL set_param fail_loc=0x170b
+	rmdir $DIR/$tdir &
+	sleep 0.5
+	stop mds2
+	stop mds3
+	stop mds1
+	start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS || error "MDT1 start failed"
+	start mds2 $(mdsdevname 2) $MDS_MOUNT_OPTS || error "MDT2 start failed"
+	start mds3 $(mdsdevname 3) $MDS_MOUNT_OPTS || error "MDT3 start failed"
+}
+run_test 136 "MDS to disconnect all OSPs first, then cleanup ldlm"
+
 complete $SECONDS
 check_and_cleanup_lustre
 exit_status
-- 
1.8.3.1