Whamcloud - gitweb
LU-9266 hsm: don't add request when cdt is stopped 41/28441/2
author: Sergey Cheremencev <sergey.cheremencev@seagate.com>
Mon, 20 Mar 2017 14:20:40 +0000 (22:20 +0800)
committer: John L. Hammond <john.hammond@intel.com>
Wed, 16 Aug 2017 20:43:10 +0000 (20:43 +0000)
Check cdt_state after getting the layout lock in mdt_hsm_add_actions().
This fix protects against several RESTORE records addressed to the
same object in the llog. Such records cause the mount to hang when
starting HSM:
PID: 15524  TASK: ffff880068b5b540  CPU: 4   COMMAND: "lctl"
 #0 [ffff8800bacd9728] schedule at ffffffff81525d30
 #1 [ffff8800bacd97f0] ldlm_completion_ast at ffffffffa08527f5 [ptlrpc]
 #2 [ffff8800bacd9890] ldlm_cli_enqueue_local at ffffffffa0851b8e
[ptlrpc]
 #3 [ffff8800bacd9910] mdt_object_lock0 at ffffffffa0e4ec4c [mdt]
 #4 [ffff8800bacd99c0] mdt_object_lock at ffffffffa0e4f694 [mdt]
 #5 [ffff8800bacd99d0] mdt_object_find_lock at ffffffffa0e4f9c1 [mdt]
 #6 [ffff8800bacd9a00] hsm_restore_cb at ffffffffa0e9b533 [mdt]
 #7 [ffff8800bacd9a50] llog_process_thread at ffffffffa05fd699
[obdclass]
 #8 [ffff8800bacd9b10] llog_process_or_fork at ffffffffa05fdbaf
[obdclass]
 #9 [ffff8800bacd9b60] llog_cat_process_cb at ffffffffa0601250
[obdclass]

Lustre-change: https://review.whamcloud.com/26215
Lustre-commit: 37a5157b84bce367e31743cb8648a15618492531

Change-Id: Ib09139795d847cac2e5f079a192a3548d32db09c
Seagate-bug-id: MRP-4251
Signed-off-by: Sergey Cheremencev <sergey.cheremencev@seagate.com>
Signed-off-by: Hongchao Zhang <hongchao.zhang@intel.com>
Reviewed-by: Faccini Bruno <bruno.faccini@intel.com>
Reviewed-by: John L. Hammond <john.hammond@intel.com>
Signed-off-by: Minh Diep <minh.diep@intel.com>
Reviewed-on: https://review.whamcloud.com/28441
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
lustre/include/obd_support.h
lustre/mdt/mdt_hsm_cdt_client.c
lustre/tests/sanity-hsm.sh

index 433c2f8..0130529 100644 (file)
@@ -249,6 +249,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_MDS_XATTR_REP                 0x161
 #define OBD_FAIL_MDS_TRACK_OVERFLOW     0x162
 #define OBD_FAIL_MDS_LOV_CREATE_RACE    0x163
+#define OBD_FAIL_MDS_HSM_CDT_DELAY      0x164
 
 /* layout lock */
 #define OBD_FAIL_MDS_NO_LL_GETATTR      0x170
index e1f1673..aa4f123 100644 (file)
@@ -446,10 +446,25 @@ int mdt_hsm_add_actions(struct mdt_thread_info *mti,
                        mdt_object_put(mti->mti_env, obj);
 
                        mutex_lock(&cdt->cdt_restore_lock);
+                       if (unlikely((cdt->cdt_state == CDT_STOPPED) ||
+                                    (cdt->cdt_state == CDT_STOPPING))) {
+                               mutex_unlock(&cdt->cdt_restore_lock);
+                               mdt_object_unlock(mti, NULL, &crh->crh_lh, 1);
+                               OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem);
+                               GOTO(out, rc = -EAGAIN);
+                       }
                        list_add_tail(&crh->crh_list, &cdt->cdt_restore_hdl);
                        mutex_unlock(&cdt->cdt_restore_lock);
                }
 record:
+               /*
+                * Wait here to catch the 2nd RESTORE request to the same FID.
+                * Normally layout lock protects against adding such request.
+                * But when the cdt is stopping it cancels all locks via
+                * ldlm_resource_clean, so this protection may not work.
+                * See LU-9266 and sanity-hsm_407 for details.
+                */
+               OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_HSM_CDT_DELAY, cfs_fail_val);
                /* record request */
                rc = mdt_agent_record_add(mti->mti_env, mdt, compound_id,
                                          archive_id, flags, hai);
index 25c9cf8..8949a08 100755 (executable)
@@ -5264,6 +5264,45 @@ test_406() {
 }
 run_test 406 "attempting to migrate HSM archived files is safe"
 
+test_407() {
+       needclients 2 || return 0
+       # this test needs a running copytool
+       copytool_setup
+
+       mkdir -p $DIR/$tdir
+
+       local f=$DIR/$tdir/$tfile
+       local f2=$DIR2/$tdir/$tfile
+       local fid
+       fid=$(make_custom_file_for_progress $f 39 1000000)
+       [ $? != 0 ] && skip "not enough free space" && return
+
+       $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f
+       wait_request_state $fid ARCHIVE SUCCEED
+       $LFS hsm_release $f
+
+#define OBD_FAIL_MDS_HSM_CDT_DELAY      0x164
+       do_facet $SINGLEMDS $LCTL set_param fail_val=5 fail_loc=0x164
+
+       md5sum $f &
+       # The 1st request holds the layout lock while its
+       # RESTORE record has not yet been added to the llog
+       md5sum $f2 &
+       sleep 2
+
+       # After umount the hsm_actions->O/x/x log should not contain
+       # duplicate RESTORE records for one FID, like the two below:
+       #[0x200000401:0x1:0x0]...0x58d03a0d/0x58d03a0c action=RESTORE...WAITING
+       #[0x200000401:0x1:0x0]...0x58d03a0c/0x58d03a0d action=RESTORE...WAITING
+       sleep 30 &&
+               do_facet $SINGLEMDS "$LCTL get_param $HSM_PARAM.actions"&
+       fail $SINGLEMDS
+
+       wait_request_state $fid RESTORE SUCCEED
+       copytool_cleanup
+}
+run_test 407 "Check for double RESTORE records in llog"
+
 test_500()
 {
        [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.92) ] &&