Whamcloud - gitweb
LU-3834 mdt: handle swap_layouts failures during restore 31/7631/14
author: Bruno Faccini <bruno.faccini@intel.com>
Tue, 10 Dec 2013 09:55:59 +0000 (10:55 +0100)
committer: Oleg Drokin <oleg.drokin@intel.com>
Tue, 14 Jan 2014 17:33:52 +0000 (17:33 +0000)
Currently nothing is done after a swap_layouts failure during restore;
this can leave the file in an incoherent state, and thus
unavailable, because HS_RELEASED is cleared but LOV_PATTERN_F_RELEASED
is still set.
This patch allows the original layout to be recovered by using the
SWAP_LAYOUTS_MDS_HSM flag. Additionally, this requires the HSM xattr of
the data FID to be set.
It also adds layout-swap failure injection and a related test.

Signed-off-by: Bruno Faccini <bruno.faccini@intel.com>
Change-Id: Id0e9a005362e4a3854b33f6ce1888197d20e7dbf
Reviewed-on: http://review.whamcloud.com/7631
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Aurelien Degremont <aurelien.degremont@cea.fr>
Reviewed-by: jacques-Charles Lafoucriere <jacques-charles.lafoucriere@cea.fr>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/include/obd_support.h
lustre/mdd/mdd_object.c
lustre/mdt/mdt_coordinator.c
lustre/tests/sanity-hsm.sh

index db75316..fb1b2b1 100644 (file)
@@ -242,6 +242,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
 #define OBD_FAIL_MDS_SWAP_LAYOUTS_NET          0x14f
 #define OBD_FAIL_MDS_HSM_ACTION_NET            0x150
 #define OBD_FAIL_MDS_CHANGELOG_INIT            0x151
+#define OBD_FAIL_MDS_HSM_SWAP_LAYOUTS          0x152
 
 /* layout lock */
 #define OBD_FAIL_MDS_NO_LL_GETATTR      0x170
index 52a0cf6..891628c 100644 (file)
@@ -1468,13 +1468,18 @@ static int mdd_swap_layouts(const struct lu_env *env, struct md_object *obj1,
        if (rc != 0)
                GOTO(stop, rc);
 
-       if (fst_buf->lb_buf != NULL)
-               rc = mdo_xattr_set(env, snd_o, fst_buf, XATTR_NAME_LOV,
-                                  LU_XATTR_REPLACE, handle,
-                                  mdd_object_capa(env, snd_o));
-       else
-               rc = mdo_xattr_del(env, snd_o, XATTR_NAME_LOV, handle,
-                                  mdd_object_capa(env, snd_o));
+       if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_MDS_HSM_SWAP_LAYOUTS))) {
+               rc = -EOPNOTSUPP;
+       } else {
+               if (fst_buf->lb_buf != NULL)
+                       rc = mdo_xattr_set(env, snd_o, fst_buf, XATTR_NAME_LOV,
+                                          LU_XATTR_REPLACE, handle,
+                                          mdd_object_capa(env, snd_o));
+               else
+                       rc = mdo_xattr_del(env, snd_o, XATTR_NAME_LOV, handle,
+                                          mdd_object_capa(env, snd_o));
+       }
+
        if (rc != 0) {
                int steps = 0;
 
index 3915e21..037bcbd 100644 (file)
@@ -1157,9 +1157,11 @@ out:
  * \param mti [IN] context
  * \param fid1 [IN]
  * \param fid2 [IN]
+ * \param mh_common [IN] MD HSM
  */
 static int hsm_swap_layouts(struct mdt_thread_info *mti,
-                           const lustre_fid *fid, const lustre_fid *dfid)
+                           const lustre_fid *fid, const lustre_fid *dfid,
+                           struct md_hsm *mh_common)
 {
        struct mdt_device       *mdt = mti->mti_mdt;
        struct mdt_object       *child1, *child2;
@@ -1182,15 +1184,28 @@ static int hsm_swap_layouts(struct mdt_thread_info *mti,
        /* if copy tool closes the volatile before sending the final
         * progress through llapi_hsm_copy_end(), all the objects
         * are removed and mdd_swap_layout LBUG */
-       if (mdt_object_exists(child2)) {
-               rc = mo_swap_layouts(mti->mti_env, mdt_object_child(child1),
-                                    mdt_object_child(child2), 0);
-       } else {
+       if (!mdt_object_exists(child2)) {
                CERROR("%s: Copytool has closed volatile file "DFID"\n",
                       mdt_obd_name(mti->mti_mdt), PFID(dfid));
-               rc = -ENOENT;
+               GOTO(out_child2, rc = -ENOENT);
        }
+       /* Since we only handle restores here, unconditionally use
+        * SWAP_LAYOUTS_MDS_HSM flag to ensure original layout will
+        * be preserved in case of failure during swap_layout and not
+        * leave a file in an intermediate but incoherent state.
+        * But need to setup HSM xattr of data FID before, reuse
+        * mti and mh presets for FID in hsm_cdt_request_completed(),
+        * only need to clear RELEASED and DIRTY.
+        */
+       mh_common->mh_flags &= ~(HS_RELEASED | HS_DIRTY);
+       rc = mdt_hsm_attr_set(mti, child2, mh_common);
+       if (rc == 0)
+               rc = mo_swap_layouts(mti->mti_env,
+                                    mdt_object_child(child1),
+                                    mdt_object_child(child2),
+                                    SWAP_LAYOUTS_MDS_HSM);
 
+out_child2:
        mdt_object_unlock_put(mti, child2, lh2, 1);
 out_child1:
        mdt_object_put(mti->mti_env, child1);
@@ -1320,8 +1335,10 @@ static int hsm_cdt_request_completed(struct mdt_thread_info *mti,
                case HSMA_RESTORE:
                        hsm_set_cl_event(&cl_flags, HE_RESTORE);
 
-                       /* clear RELEASED and DIRTY */
-                       mh.mh_flags &= ~(HS_RELEASED | HS_DIRTY);
+                       /* do not clear RELEASED and DIRTY here
+                        * this will occur in hsm_swap_layouts()
+                        */
+
                        /* Restoring has changed the file version on
                         * disk. */
                        mh.mh_arch_ver = pgs->hpk_data_version;
@@ -1378,7 +1395,7 @@ unlock:
                 * only if restore is successfull */
                if (pgs->hpk_errval == 0) {
                        rc = hsm_swap_layouts(mti, &car->car_hai->hai_fid,
-                                             &car->car_hai->hai_dfid);
+                                             &car->car_hai->hai_dfid, &mh);
                        if (rc) {
                                if (cdt->cdt_policy & CDT_NORETRY_ACTION)
                                        *status = ARS_FAILED;
index 1aa09b9..4ef21eb 100644 (file)
@@ -1155,6 +1155,53 @@ test_12n() {
 }
 run_test 12n "Import/implicit restore/release"
 
+test_12o() {
+       # test needs a running copytool
+       copytool_setup
+
+       mkdir -p $DIR/$tdir
+       local f=$DIR/$tdir/$tfile
+       local fid=$(copy_file /etc/hosts $f)
+
+       $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f
+       wait_request_state $fid ARCHIVE SUCCEED
+       $LFS hsm_release $f || error "release of $f failed"
+
+#define OBD_FAIL_MDS_HSM_SWAP_LAYOUTS          0x152
+       do_facet $SINGLEMDS lctl set_param fail_loc=0x152
+
+       # set no retry action mode
+       cdt_set_no_retry
+
+       diff -q /etc/hosts $f
+       local st=$?
+
+       # we check we had a restore failure
+       wait_request_state $fid RESTORE FAILED
+
+       [[ $st -eq 0 ]] && error "Restore must fail"
+
+       # remove no retry action mode
+       cdt_clear_no_retry
+
+       # check file is still released
+       check_hsm_flags $f "0x0000000d"
+
+       # retry w/o failure injection
+       do_facet $SINGLEMDS lctl set_param fail_loc=0
+
+       diff -q /etc/hosts $f
+       st=$?
+
+       # we check we had a restore done
+       wait_request_state $fid RESTORE SUCCEED
+
+       [[ $st -eq 0 ]] || error "Restored file differs"
+
+       copytool_cleanup
+}
+run_test 12o "Layout-swap failure during Restore leaves file released"
+
 test_13() {
        # test needs a running copytool
        copytool_setup