From 37a5157b84bce367e31743cb8648a15618492531 Mon Sep 17 00:00:00 2001 From: Sergey Cheremencev Date: Mon, 20 Mar 2017 22:20:40 +0800 Subject: [PATCH] LU-9266 hsm: don't add request when cdt is stopped Check cdt_state after getting layout lock in mdt_hsm_add_actions. This fix protects against several RESTORE records addressed to the same object in the llog. Such records cause mount to hang when starting hsm: PID: 15524 TASK: ffff880068b5b540 CPU: 4 COMMAND: "lctl" #0 [ffff8800bacd9728] schedule at ffffffff81525d30 #1 [ffff8800bacd97f0] ldlm_completion_ast at ffffffffa08527f5 [ptlrpc] #2 [ffff8800bacd9890] ldlm_cli_enqueue_local at ffffffffa0851b8e [ptlrpc] #3 [ffff8800bacd9910] mdt_object_lock0 at ffffffffa0e4ec4c [mdt] #4 [ffff8800bacd99c0] mdt_object_lock at ffffffffa0e4f694 [mdt] #5 [ffff8800bacd99d0] mdt_object_find_lock at ffffffffa0e4f9c1 [mdt] #6 [ffff8800bacd9a00] hsm_restore_cb at ffffffffa0e9b533 [mdt] #7 [ffff8800bacd9a50] llog_process_thread at ffffffffa05fd699 [obdclass] #8 [ffff8800bacd9b10] llog_process_or_fork at ffffffffa05fdbaf [obdclass] #9 [ffff8800bacd9b60] llog_cat_process_cb at ffffffffa0601250 [obdclass] Change-Id: Ib09139795d847cac2e5f079a192a3548d32db09c Seagate-bug-id: MRP-4251 Signed-off-by: Sergey Cheremencev Signed-off-by: Hongchao Zhang Reviewed-on: https://review.whamcloud.com/26215 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Faccini Bruno Reviewed-by: John L. 
Hammond Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/mdt/mdt_hsm_cdt_client.c | 15 +++++++++++++++ lustre/tests/sanity-hsm.sh | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 3d1dae7..54843ee 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -249,6 +249,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_XATTR_REP 0x161 #define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162 #define OBD_FAIL_MDS_LOV_CREATE_RACE 0x163 +#define OBD_FAIL_MDS_HSM_CDT_DELAY 0x164 /* layout lock */ #define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 diff --git a/lustre/mdt/mdt_hsm_cdt_client.c b/lustre/mdt/mdt_hsm_cdt_client.c index e1f1673..aa4f123 100644 --- a/lustre/mdt/mdt_hsm_cdt_client.c +++ b/lustre/mdt/mdt_hsm_cdt_client.c @@ -446,10 +446,25 @@ int mdt_hsm_add_actions(struct mdt_thread_info *mti, mdt_object_put(mti->mti_env, obj); mutex_lock(&cdt->cdt_restore_lock); + if (unlikely((cdt->cdt_state == CDT_STOPPED) || + (cdt->cdt_state == CDT_STOPPING))) { + mutex_unlock(&cdt->cdt_restore_lock); + mdt_object_unlock(mti, NULL, &crh->crh_lh, 1); + OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem); + GOTO(out, rc = -EAGAIN); + } list_add_tail(&crh->crh_list, &cdt->cdt_restore_hdl); mutex_unlock(&cdt->cdt_restore_lock); } record: + /* + * Wait here to catch the 2nd RESTORE request to the same FID. + * Normally layout lock protects against adding such a request. + * But when cdt is stopping it cancels all locks via + * ldlm_resource_clean and this protection may not work. + * See LU-9266 and sanity-hsm_407 for details. 
+ */ + OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_HSM_CDT_DELAY, cfs_fail_val); /* record request */ rc = mdt_agent_record_add(mti->mti_env, mdt, compound_id, archive_id, flags, hai); diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index 25c9cf8..8949a08 100755 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -5264,6 +5264,45 @@ test_406() { } run_test 406 "attempting to migrate HSM archived files is safe" +test_407() { + needclients 2 || return 0 + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + + local f=$DIR/$tdir/$tfile + local f2=$DIR2/$tdir/$tfile + local fid + fid=$(make_custom_file_for_progress $f 39 1000000) + [ $? != 0 ] && skip "not enough free space" && return + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f + +#define OBD_FAIL_MDS_HSM_CDT_DELAY 0x164 + do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x164 + + md5sum $f & + # 1st request holds layout lock while appropriate + # RESTORE record is still not added to llog + md5sum $f2 & + sleep 2 + + # after umount hsm_actions->O/x/x log shouldn't have + # double RESTORE records like below + #[0x200000401:0x1:0x0]...0x58d03a0d/0x58d03a0c action=RESTORE...WAITING + #[0x200000401:0x1:0x0]...0x58d03a0c/0x58d03a0d action=RESTORE...WAITING + sleep 30 && + do_facet $SINGLEMDS "$LCTL get_param $HSM_PARAM.actions"& + fail $SINGLEMDS + + wait_request_state $fid RESTORE SUCCEED + copytool_cleanup +} +run_test 407 "Check for double RESTORE records in llog" + test_500() { [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.92) ] && -- 1.8.3.1