From b14cd24f1270e4240f2ac05df4fd6692d07bc942 Mon Sep 17 00:00:00 2001 From: Bruno Faccini Date: Tue, 10 Dec 2013 10:55:59 +0100 Subject: [PATCH] LU-3834 mdt: handle swap_layouts failures during restore Currently nothing is done after a swap_layouts failure during restore; this can lead to the file being left in an incoherent state and thus being unavailable, because HS_RELEASED is cleared but LOV_PATTERN_F_RELEASED is still set. This patch allows the original layout to be recovered by the use of the SWAP_LAYOUTS_MDS_HSM flag. Additionally, this requires the HSM xattr of the data FID to be set. Also adds layout-swap failure injection and a related test. Signed-off-by: Bruno Faccini Change-Id: Id0e9a005362e4a3854b33f6ce1888197d20e7dbf Reviewed-on: http://review.whamcloud.com/7631 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Aurelien Degremont Reviewed-by: jacques-Charles Lafoucriere Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/mdd/mdd_object.c | 19 +++++++++++------- lustre/mdt/mdt_coordinator.c | 35 ++++++++++++++++++++++++--------- lustre/tests/sanity-hsm.sh | 47 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 16 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index db75316..fb1b2b1 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -242,6 +242,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_MDS_SWAP_LAYOUTS_NET 0x14f #define OBD_FAIL_MDS_HSM_ACTION_NET 0x150 #define OBD_FAIL_MDS_CHANGELOG_INIT 0x151 +#define OBD_FAIL_MDS_HSM_SWAP_LAYOUTS 0x152 /* layout lock */ #define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 diff --git a/lustre/mdd/mdd_object.c b/lustre/mdd/mdd_object.c index 52a0cf6..891628c 100644 --- a/lustre/mdd/mdd_object.c +++ b/lustre/mdd/mdd_object.c @@ -1468,13 +1468,18 @@ static int mdd_swap_layouts(const struct lu_env *env, struct md_object *obj1, if (rc != 0) GOTO(stop, rc); - if (fst_buf->lb_buf != NULL) - rc = 
mdo_xattr_set(env, snd_o, fst_buf, XATTR_NAME_LOV, - LU_XATTR_REPLACE, handle, - mdd_object_capa(env, snd_o)); - else - rc = mdo_xattr_del(env, snd_o, XATTR_NAME_LOV, handle, - mdd_object_capa(env, snd_o)); + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_MDS_HSM_SWAP_LAYOUTS))) { + rc = -EOPNOTSUPP; + } else { + if (fst_buf->lb_buf != NULL) + rc = mdo_xattr_set(env, snd_o, fst_buf, XATTR_NAME_LOV, + LU_XATTR_REPLACE, handle, + mdd_object_capa(env, snd_o)); + else + rc = mdo_xattr_del(env, snd_o, XATTR_NAME_LOV, handle, + mdd_object_capa(env, snd_o)); + } + if (rc != 0) { int steps = 0; diff --git a/lustre/mdt/mdt_coordinator.c b/lustre/mdt/mdt_coordinator.c index 3915e21..037bcbd 100644 --- a/lustre/mdt/mdt_coordinator.c +++ b/lustre/mdt/mdt_coordinator.c @@ -1157,9 +1157,11 @@ out: * \param mti [IN] context * \param fid1 [IN] * \param fid2 [IN] + * \param mh_common [IN] MD HSM */ static int hsm_swap_layouts(struct mdt_thread_info *mti, - const lustre_fid *fid, const lustre_fid *dfid) + const lustre_fid *fid, const lustre_fid *dfid, + struct md_hsm *mh_common) { struct mdt_device *mdt = mti->mti_mdt; struct mdt_object *child1, *child2; @@ -1182,15 +1184,28 @@ static int hsm_swap_layouts(struct mdt_thread_info *mti, /* if copy tool closes the volatile before sending the final * progress through llapi_hsm_copy_end(), all the objects * are removed and mdd_swap_layout LBUG */ - if (mdt_object_exists(child2)) { - rc = mo_swap_layouts(mti->mti_env, mdt_object_child(child1), - mdt_object_child(child2), 0); - } else { + if (!mdt_object_exists(child2)) { CERROR("%s: Copytool has closed volatile file "DFID"\n", mdt_obd_name(mti->mti_mdt), PFID(dfid)); - rc = -ENOENT; + GOTO(out_child2, rc = -ENOENT); } + /* Since we only handle restores here, unconditionally use + * SWAP_LAYOUTS_MDS_HSM flag to ensure original layout will + * be preserved in case of failure during swap_layout and not + * leave a file in an intermediate but incoherent state. 
+ * But need to setup HSM xattr of data FID before, reuse + * mti and mh presets for FID in hsm_cdt_request_completed(), + * only need to clear RELEASED and DIRTY. + */ + mh_common->mh_flags &= ~(HS_RELEASED | HS_DIRTY); + rc = mdt_hsm_attr_set(mti, child2, mh_common); + if (rc == 0) + rc = mo_swap_layouts(mti->mti_env, + mdt_object_child(child1), + mdt_object_child(child2), + SWAP_LAYOUTS_MDS_HSM); +out_child2: mdt_object_unlock_put(mti, child2, lh2, 1); out_child1: mdt_object_put(mti->mti_env, child1); @@ -1320,8 +1335,10 @@ static int hsm_cdt_request_completed(struct mdt_thread_info *mti, case HSMA_RESTORE: hsm_set_cl_event(&cl_flags, HE_RESTORE); - /* clear RELEASED and DIRTY */ - mh.mh_flags &= ~(HS_RELEASED | HS_DIRTY); + /* do not clear RELEASED and DIRTY here + * this will occur in hsm_swap_layouts() + */ + /* Restoring has changed the file version on * disk. */ mh.mh_arch_ver = pgs->hpk_data_version; @@ -1378,7 +1395,7 @@ unlock: * only if restore is successfull */ if (pgs->hpk_errval == 0) { rc = hsm_swap_layouts(mti, &car->car_hai->hai_fid, - &car->car_hai->hai_dfid); + &car->car_hai->hai_dfid, &mh); if (rc) { if (cdt->cdt_policy & CDT_NORETRY_ACTION) *status = ARS_FAILED; diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index 1aa09b9..4ef21eb 100644 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -1155,6 +1155,53 @@ test_12n() { } run_test 12n "Import/implicit restore/release" +test_12o() { + # test needs a running copytool + copytool_setup + + mkdir -p $DIR/$tdir + local f=$DIR/$tdir/$tfile + local fid=$(copy_file /etc/hosts $f) + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + $LFS hsm_release $f || error "release of $f failed" + +#define OBD_FAIL_MDS_HSM_SWAP_LAYOUTS 0x152 + do_facet $SINGLEMDS lctl set_param fail_loc=0x152 + + # set no retry action mode + cdt_set_no_retry + + diff -q /etc/hosts $f + local st=$? 
+ + # we check we had a restore failure + wait_request_state $fid RESTORE FAILED + + [[ $st -eq 0 ]] && error "Restore must fail" + + # remove no retry action mode + cdt_clear_no_retry + + # check file is still released + check_hsm_flags $f "0x0000000d" + + # retry w/o failure injection + do_facet $SINGLEMDS lctl set_param fail_loc=0 + + diff -q /etc/hosts $f + st=$? + + # we check we had a restore done + wait_request_state $fid RESTORE SUCCEED + + [[ $st -eq 0 ]] || error "Restored file differs" + + copytool_cleanup +} +run_test 12o "Layout-swap failure during Restore leaves file released" + test_13() { # test needs a running copytool copytool_setup -- 1.8.3.1