From eed4d4c7523c26cfc5bc230986d96b2acf152dee Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Tue, 7 Feb 2023 12:29:24 +0300 Subject: [PATCH] LU-16536 osp: don't cleanup ldlm in precleanup phase Instead, do this in the cleanup phase so that all OSPs have a chance to abort in-flight RPCs which can block an MDT thread holding LDLM locks. Fixes: 226fd401f9 ("LU-7660 dne: support fs default stripe") Signed-off-by: Alex Zhuravlev Change-Id: Ib3714b29c514a7fa938d47717dc36525654407d6 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49925 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Lai Siyao Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/osp/osp_dev.c | 9 +++++++-- lustre/target/out_handler.c | 3 +++ lustre/tests/replay-single.sh | 23 +++++++++++++++++++++++ 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 395118e..923a1f5 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -691,6 +691,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OUT_OBJECT_MISS 0x1708 #define OBD_FAIL_OUT_EIO 0x1709 #define OBD_FAIL_BUT_UPDATE_NET_REP 0x170a +#define OBD_FAIL_OUT_DROP_DESTROY 0x170b /* MIGRATE */ #define OBD_FAIL_MIGRATE_ENTRIES 0x1801 diff --git a/lustre/osp/osp_dev.c b/lustre/osp/osp_dev.c index 95c196e..5497028 100644 --- a/lustre/osp/osp_dev.c +++ b/lustre/osp/osp_dev.c @@ -662,10 +662,15 @@ static int osp_process_config(const struct lu_env *env, case LCFG_PRE_CLEANUP: rc = osp_disconnect(d); osp_update_fini(env, d); - if (obd->obd_namespace != NULL) - ldlm_namespace_free_prior(obd->obd_namespace, NULL, 1); break; case LCFG_CLEANUP: + /* + * cleanup ldlm so that PRE_CLEANUP phase doesn't block + * waiting for locks held by MDT threads waiting for + * all OSPs to interrupt their in-flight RPCs + */ + if (obd->obd_namespace != NULL) + ldlm_namespace_free_prior(obd->obd_namespace, NULL, 1); 
lu_dev_del_linkage(dev->ld_site, dev); rc = osp_shutdown(env, d); break; diff --git a/lustre/target/out_handler.c b/lustre/target/out_handler.c index c293050..49accc7 100644 --- a/lustre/target/out_handler.c +++ b/lustre/target/out_handler.c @@ -589,6 +589,9 @@ static int out_destroy(struct tgt_session_info *tsi) tti->tti_u.update.tti_update_reply, tti->tti_u.update.tti_update_reply_index); + if (OBD_FAIL_CHECK(OBD_FAIL_OUT_DROP_DESTROY)) + tsi->tsi_pill->rc_req->rq_no_reply = 1; + RETURN(rc); } diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 2240e8e..a156b79 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -4997,6 +4997,29 @@ test_135() { } run_test 135 "Server failure in lock replay phase" +test_136() { + (( $MDSCOUNT >= 3 )) || skip "needs > 2 MDTs" + (( MDS1_VERSION >= $(version_code 2.15.53) )) || + skip "need MDS version >= 2.15.53 for LU-16536 fix" + + $LFS mkdir -i0 -c3 $DIR/$tdir || error "can't mkdir" + $LFS getdirstripe $DIR/$tdir + sync;sync;sync + +#define OBD_FAIL_OUT_DROP_DESTROY 0x170b + local mdts=$(comma_list $(mdts_nodes)) + do_nodes $mdts $LCTL set_param fail_loc=0x170b + rmdir $DIR/$tdir & + sleep 0.5 + stop mds2 + stop mds3 + stop mds1 + start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS || error "MDT1 start failed" + start mds2 $(mdsdevname 2) $MDS_MOUNT_OPTS || error "MDT2 start failed" + start mds3 $(mdsdevname 3) $MDS_MOUNT_OPTS || error "MDT3 start failed" +} +run_test 136 "MDS to disconnect all OSPs first, then cleanup ldlm" + complete $SECONDS check_and_cleanup_lustre exit_status -- 1.8.3.1