From: Etienne AUJAMES Date: Thu, 21 Oct 2021 14:31:01 +0000 (+0200) Subject: LU-15132 hsm: Protect against parallel HSM restore requests X-Git-Tag: 2.15.51~47 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=66b3e74bccf1451d135b7f331459b6af1c06431b LU-15132 hsm: Protect against parallel HSM restore requests Multiple parallel accesses (read/write) to the same released file could cause multiple HSM restore requests to be sent. On the MDT side, each restore request waits the first one to complete before grabbing the MDS_INODELOCK_LAYOUT LCK_EX and registering the llog record. This could cause several MDT threads to hang for the same restore request sent in parallel. In the worst case, all MDT threads can hang and the MDS is not longer able to handle requests. This patch checks if an HSM restore handle exists before taking the lock. Test-Parameters: testlist=sanity-hsm,sanity-hsm Test-Parameters: testlist=sanity-hsm env=ONLY=12s,ONLY_REPEAT=50 Signed-off-by: Etienne AUJAMES Change-Id: I9584edc2c7411aa41b2e318e55f57c117d1c3dfb Reviewed-on: https://review.whamcloud.com/45367 Tested-by: jenkins Tested-by: Maloo Reviewed-by: John L. Hammond Reviewed-by: Nikitas Angelinas Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index f6aea9a..96cd590 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -267,6 +267,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_MDS_STRIPE_CREATE 0x188 #define OBD_FAIL_MDS_STRIPE_FID 0x189 #define OBD_FAIL_MDS_LINK_RENAME_RACE 0x18a +#define OBD_FAIL_MDS_HSM_RESTORE_RACE 0x18b /* OI scrub */ #define OBD_FAIL_OSD_SCRUB_DELAY 0x190 diff --git a/lustre/mdt/mdt_coordinator.c b/lustre/mdt/mdt_coordinator.c index d0885d0..b321d94 100644 --- a/lustre/mdt/mdt_coordinator.c +++ b/lustre/mdt/mdt_coordinator.c @@ -488,6 +488,9 @@ static void mdt_hsm_cdt_cleanup(struct mdt_device *mdt) mutex_lock(&cdt->cdt_restore_lock); list_for_each_entry_safe(crh, tmp3, &cdt->cdt_restore_handle_list, crh_list) { + /* not locked yet, cleanup by cdt_restore_handle_add() */ + if (crh->crh_lh.mlh_type == MDT_NUL_LOCK) + continue; list_del(&crh->crh_list); /* give back layout lock */ mdt_object_unlock(cdt_mti, NULL, &crh->crh_lh, 1); @@ -801,10 +804,21 @@ clean_cb_alloc: RETURN(rc); } +/** + * register a new HSM restore handle for a file and take EX lock on the layout + * \param mti [IN] thread info + * \param cdt [IN] coordinator + * \param fid [IN] fid of the file to restore + * \param he [IN] HSM extent + * \retval 0 success + * \retval 1 restore handle already exists for the fid + * \retval -ve failure + */ int cdt_restore_handle_add(struct mdt_thread_info *mti, struct coordinator *cdt, const struct lu_fid *fid, const struct hsm_extent *he) { + struct mdt_lock_handle lh = { 0 }; struct cdt_restore_handle *crh; struct mdt_object *obj; int rc; @@ -821,31 +835,48 @@ int cdt_restore_handle_add(struct mdt_thread_info *mti, struct coordinator *cdt, */ crh->crh_extent.start = 0; crh->crh_extent.end = he->length; + crh->crh_lh.mlh_type = MDT_NUL_LOCK; + + mutex_lock(&cdt->cdt_restore_lock); + if (cdt_restore_handle_find(cdt, fid) != NULL) + GOTO(out_crl, rc = 1); + + if (unlikely(cdt->cdt_state == CDT_STOPPED || + cdt->cdt_state == CDT_STOPPING)) + GOTO(out_crl, rc = -EAGAIN); + + list_add_tail(&crh->crh_list, &cdt->cdt_restore_handle_list); + mutex_unlock(&cdt->cdt_restore_lock); + /* get the layout lock */ - mdt_lock_reg_init(&crh->crh_lh, LCK_EX); - obj = mdt_object_find_lock(mti, &crh->crh_fid, &crh->crh_lh, + mdt_lock_reg_init(&lh, LCK_EX); + obj = mdt_object_find_lock(mti, &crh->crh_fid, &lh, MDS_INODELOCK_LAYOUT); - if (IS_ERR(obj)) - GOTO(out_crh, rc = PTR_ERR(obj)); + if (IS_ERR(obj)) { + mutex_lock(&cdt->cdt_restore_lock); + GOTO(out_ldel, rc = PTR_ERR(obj)); + } /* We do not keep a reference on the object during the restore - * which can be very long. */ + * which can be very long. + */ mdt_object_put(mti->mti_env, obj); mutex_lock(&cdt->cdt_restore_lock); if (unlikely(cdt->cdt_state == CDT_STOPPED || - cdt->cdt_state == CDT_STOPPING)) { - mutex_unlock(&cdt->cdt_restore_lock); + cdt->cdt_state == CDT_STOPPING)) GOTO(out_lh, rc = -EAGAIN); - } - list_add_tail(&crh->crh_list, &cdt->cdt_restore_handle_list); + crh->crh_lh = lh; mutex_unlock(&cdt->cdt_restore_lock); RETURN(0); out_lh: mdt_object_unlock(mti, NULL, &crh->crh_lh, 1); -out_crh: +out_ldel: + list_del(&crh->crh_list); +out_crl: + mutex_unlock(&cdt->cdt_restore_lock); OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem); return rc; @@ -953,6 +984,8 @@ static int hsm_restore_cb(const struct lu_env *env, } rc = cdt_restore_handle_add(mti, cdt, &hai->hai_fid, &hai->hai_extent); + if (rc == 1) + rc = 0; out: RETURN(rc); } diff --git a/lustre/mdt/mdt_hsm_cdt_client.c b/lustre/mdt/mdt_hsm_cdt_client.c index 9f78796..4e837d6 100644 --- a/lustre/mdt/mdt_hsm_cdt_client.c +++ b/lustre/mdt/mdt_hsm_cdt_client.c @@ -365,8 +365,16 @@ static int mdt_hsm_register_hal(struct mdt_thread_info *mti, if (hai->hai_extent.offset != 0) GOTO(out, rc = -EPROTO); + /* LU-15132 */ + OBD_RACE(OBD_FAIL_MDS_HSM_RESTORE_RACE); + rc = cdt_restore_handle_add(mti, cdt, &hai->hai_fid, &hai->hai_extent); + if (rc == 1) { + rc = 0; + continue; + } + if (rc < 0) GOTO(out, rc); } diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index 1830789..b234123 100755 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -1384,6 +1384,37 @@ test_12r() { } run_test 12r "lseek restores released file" +test_12s() { + local f=$DIR/$tdir/$tfile + local fid + local pid1 pid2 + + (( MDS1_VERSION >= $(version_code 2.15.50) )) || + skip "Need MDS version newer than 2.15.50" + + # test needs a running copytool + copytool setup + + mkdir_on_mdt0 $DIR/$tdir + fid=$(copy_file /etc/hosts $f) + + $LFS hsm_archive $f || error "archive of $f failed" + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f || error "release of $f failed" + +#define OBD_FAIL_ONCE|OBD_FAIL_MDS_HSM_RESTORE_RACE 0x8000018b + do_facet mds1 $LCTL set_param fail_loc=0x8000018b + cat $f > /dev/null & pid1=$! + cat $f > /dev/null & pid2=$! + + wait $pid1 || error "cat process 1 fail (pid: $pid1)" + wait $pid2 || error "cat process 2 fail (pid: $pid2)" + + # Race exists if more than 1 restore requests is registered + assert_request_count $fid RESTORE 1 +} +run_test 12s "race between restore requests" + test_13() { local -i i j k=0 for i in {1..10}; do