From 94d02e5774cc0d9ca5c3c34d21c2698ab89f3a6d Mon Sep 17 00:00:00 2001 From: Courrier Guillaume Date: Thu, 17 Nov 2022 13:15:19 +0100 Subject: [PATCH] LU-13048 mdd: allow release after a non-blocking migrate lfs setstripe -i0 file lfs hsm_archive file lfs migrate -n -i1 file lfs hsm_release file These actions lead to "Cannot send HSM request ...: Operation not permitted". This happens because of a data version mismatch. This error is returned by mdt_hsm_release() when the data versions are not the same. This patch only corrects the non-blocking migrations. mdd_swap_layouts is updated to check and update the HSM archive version when possible. The new and old data versions are added as arguments to this function. If the old data version does not match the data version in the HSM attribute, we don't update the HSM attribute because we don't know what caused the inconsistency. During a swap between a volatile and a regular file, if both objects have an HSM xattr, mdd_swap_layouts was called from the MDT HSM layer (release and restore). In this case, we want to swap the HSM xattr (previously done using SWAP_LAYOUTS_MDS_HSM as a last argument to mdd_swap_layouts). If only the regular file has an HSM attribute, mdd_swap_layouts was called after a migration (blocking or not). In this case, we want to update the HSM archive version only if the file is not dirty and if the new data version is provided. Also, this patch removes the CL_LAYOUT event that was emitted for a release. Since a CL_HSM event with HE_RELEASE flag is also emitted, the CL_LAYOUT is unnecessary. For "lfs swap_layouts", the operation is denied on 2 files with HSM xattr (HSM xattr swap will cause inconsistencies). With a non-HSM file and an archived file, the operation is allowed but the dirty flag is set on the HSM file. Add lustre_swab_close_data_special() to swab close_data fields inside the union (specific to some types of close). Add regression test sanity-hsm 607a, 607b and 607c. 
Test-Parameters: clientversion=2.15.4 testlist=sanity-hsm Test-Parameters: serverversion=2.15.4 testlist=sanity-hsm env=EXCEPT="114 409a" Test-Parameters: testlist=sanity-hsm env=ONLY=607,ONLY_REPEAT=15 Signed-off-by: Courrier Guillaume Signed-off-by: Etienne AUJAMES Change-Id: I6e90131235f96255b636eea366ad0cef5f4f0b19 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49236 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- lustre/include/lustre_swab.h | 1 + lustre/include/md_object.h | 8 +- lustre/include/obd.h | 3 +- lustre/include/uapi/linux/lustre/lustre_idl.h | 5 + lustre/include/uapi/linux/lustre/lustre_user.h | 5 +- lustre/llite/file.c | 45 +++- lustre/mdc/mdc_lib.c | 2 + lustre/mdd/mdd_object.c | 353 +++++++++++++++++++------ lustre/mdt/mdt_coordinator.c | 28 +- lustre/mdt/mdt_handler.c | 4 +- lustre/mdt/mdt_lib.c | 2 +- lustre/mdt/mdt_open.c | 28 +- lustre/ptlrpc/pack_generic.c | 16 +- lustre/tests/sanity-hsm.sh | 127 +++++++++ lustre/utils/lfs.c | 30 ++- 15 files changed, 522 insertions(+), 135 deletions(-) diff --git a/lustre/include/lustre_swab.h b/lustre/include/lustre_swab.h index 8dafadd..75f8b8c 100644 --- a/lustre/include/lustre_swab.h +++ b/lustre/include/lustre_swab.h @@ -128,6 +128,7 @@ void lustre_swab_batch_update_reply(struct batch_update_reply *bur); void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl); void lustre_swab_close_data(struct close_data *data); void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync); +void lustre_swab_close_data_special(struct close_data *cd, enum mds_op_bias b); void lustre_swab_lmv_user_md(struct lmv_user_md *lum); void lustre_swab_ladvise(struct lu_ladvise *ladvise); void lustre_swab_ladvise_hdr(struct ladvise_hdr *ladvise_hdr); diff --git a/lustre/include/md_object.h b/lustre/include/md_object.h index 80f5a18..a3133f4 100644 --- a/lustre/include/md_object.h +++ b/lustre/include/md_object.h @@ 
-256,7 +256,7 @@ struct md_object_operations { /** This method is used to swap the layouts between 2 objects */ int (*moo_swap_layouts)(const struct lu_env *env, struct md_object *obj1, struct md_object *obj2, - __u64 flags); + __u64 dv1, __u64 dv2, __u64 flags); /** \retval number of bytes actually read upon success */ int (*moo_readpage)(const struct lu_env *env, struct md_object *obj, @@ -532,14 +532,14 @@ static inline int mo_layout_change(const struct lu_env *env, } static inline int mo_swap_layouts(const struct lu_env *env, - struct md_object *o1, - struct md_object *o2, __u64 flags) + struct md_object *o1, struct md_object *o2, + __u64 dv1, __u64 dv2, __u64 flags) { LASSERT(o1->mo_ops->moo_swap_layouts); LASSERT(o2->mo_ops->moo_swap_layouts); if (o1->mo_ops->moo_swap_layouts != o2->mo_ops->moo_swap_layouts) return -EPERM; - return o1->mo_ops->moo_swap_layouts(env, o1, o2, flags); + return o1->mo_ops->moo_swap_layouts(env, o1, o2, dv1, dv2, flags); } static inline int mo_open(const struct lu_env *env, struct md_object *m, diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 6ed6020..9f38cff 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -948,8 +948,9 @@ struct md_op_data { * see enum op_cli_flags */ enum md_cli_flags op_cli_flags; - /* File object data version for HSM release, on client */ + /* File object data version for HSM release or migrate, on client */ __u64 op_data_version; + __u64 op_data_version2; struct lustre_handle op_lease_handle; /* File security context, for creates/metadata ops */ diff --git a/lustre/include/uapi/linux/lustre/lustre_idl.h b/lustre/include/uapi/linux/lustre/lustre_idl.h index e63b4d2..3b012b8 100644 --- a/lustre/include/uapi/linux/lustre/lustre_idl.h +++ b/lustre/include/uapi/linux/lustre/lustre_idl.h @@ -2028,6 +2028,9 @@ enum mds_op_bias { MDS_MIGRATE_NSONLY = 1 << 23, /* create with default LMV from client */ MDS_CREATE_DEFAULT_LMV = 1 << 24, + /* Compat flag with clients that do not send old and 
new data version + * after swap layout */ + MDS_CLOSE_LAYOUT_SWAP_HSM = 1 << 25, }; #define MDS_CLOSE_INTENT (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP | \ @@ -3674,6 +3677,8 @@ struct close_data { __u16 cd_mirror_id; /* PCC release */ __u32 cd_archive_id; + /* migrate swap layout */ + __u64 cd_data_version2; }; }; diff --git a/lustre/include/uapi/linux/lustre/lustre_user.h b/lustre/include/uapi/linux/lustre/lustre_user.h index ace28bb..b089e92 100644 --- a/lustre/include/uapi/linux/lustre/lustre_user.h +++ b/lustre/include/uapi/linux/lustre/lustre_user.h @@ -1598,8 +1598,9 @@ struct if_quotactl { #define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) #define SWAP_LAYOUTS_CLOSE (1 << 4) -/* Swap XATTR_NAME_HSM as well, only on the MDT so far */ -#define SWAP_LAYOUTS_MDS_HSM (1 << 31) +/* Skip the UID/GID check before a swap layout for a release (server only) */ +#define SWAP_LAYOUTS_MDS_RELEASE (1 << 31) + struct lustre_swap_layouts { __u64 sl_flags; __u32 sl_fd; diff --git a/lustre/llite/file.c b/lustre/llite/file.c index ffc16d2..d298cbf 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -67,6 +67,12 @@ struct pcc_param { __u32 pa_layout_gen; }; +struct swap_layouts_param { + struct inode *slp_inode; + __u64 slp_dv1; + __u64 slp_dv2; +}; + static int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg); @@ -143,8 +149,9 @@ static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data, * The meaning of "data" depends on the value of "bias". * * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version. - * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to - * swap layouts with. 
+ * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to a + * struct swap_layouts_param containing the inode to swap with and the old and + * new dataversion */ static int ll_close_inode_openhandle(struct inode *inode, struct obd_client_handle *och, @@ -177,8 +184,7 @@ static int ll_close_inode_openhandle(struct inode *inode, op_data->op_attr.ia_valid |= ATTR_SIZE; op_data->op_xvalid |= OP_XVALID_BLOCKS; fallthrough; - case MDS_CLOSE_LAYOUT_SPLIT: - case MDS_CLOSE_LAYOUT_SWAP: { + case MDS_CLOSE_LAYOUT_SPLIT: { struct split_param *sp = data; LASSERT(data != NULL); @@ -188,11 +194,22 @@ static int ll_close_inode_openhandle(struct inode *inode, if (bias == MDS_CLOSE_LAYOUT_SPLIT) { op_data->op_fid2 = *ll_inode2fid(sp->sp_inode); op_data->op_mirror_id = sp->sp_mirror_id; - } else { + } else { /* MDS_CLOSE_LAYOUT_MERGE */ op_data->op_fid2 = *ll_inode2fid(data); } break; } + case MDS_CLOSE_LAYOUT_SWAP: { + struct swap_layouts_param *slp = data; + + LASSERT(data != NULL); + op_data->op_bias |= (bias | MDS_CLOSE_LAYOUT_SWAP_HSM); + op_data->op_lease_handle = och->och_lease_handle; + op_data->op_fid2 = *ll_inode2fid(slp->slp_inode); + op_data->op_data_version = slp->slp_dv1; + op_data->op_data_version2 = slp->slp_dv2; + break; + } case MDS_CLOSE_RESYNC_DONE: { struct ll_ioc_lease *ioc = data; @@ -1325,11 +1342,13 @@ static int ll_check_swap_layouts_validity(struct inode *inode1, } static int ll_swap_layouts_close(struct obd_client_handle *och, - struct inode *inode, struct inode *inode2) + struct inode *inode, struct inode *inode2, + struct lustre_swap_layouts *lsl) { - const struct lu_fid *fid1 = ll_inode2fid(inode); - const struct lu_fid *fid2; - int rc; + const struct lu_fid *fid1 = ll_inode2fid(inode); + struct swap_layouts_param slp; + const struct lu_fid *fid2; + int rc; ENTRY; CDEBUG(D_INODE, "%s: biased close of file "DFID"\n", @@ -1349,8 +1368,10 @@ static int ll_swap_layouts_close(struct obd_client_handle *och, /* Close the file and {swap,merge} 
layouts between inode & inode2. * NB: local lease handle is released in mdc_close_intent_pack() * because we still need it to pack l_remote_handle to MDT. */ - rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP, - inode2); + slp.slp_inode = inode2; + slp.slp_dv1 = lsl->sl_dv1; + slp.slp_dv2 = lsl->sl_dv2; + rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP, &slp); och = NULL; /* freed in ll_close_inode_openhandle() */ @@ -4423,7 +4444,7 @@ ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (och == NULL) GOTO(out, rc = -ENOLCK); inode2 = file_inode(file2); - rc = ll_swap_layouts_close(och, inode, inode2); + rc = ll_swap_layouts_close(och, inode, inode2, &lsl); } else { rc = ll_swap_layouts(file, file2, &lsl); } diff --git a/lustre/mdc/mdc_lib.c b/lustre/mdc/mdc_lib.c index ff56a6f..a60396c 100644 --- a/lustre/mdc/mdc_lib.c +++ b/lustre/mdc/mdc_lib.c @@ -566,6 +566,8 @@ static void mdc_close_intent_pack(struct req_capsule *pill, } } else if (bias & MDS_PCC_ATTACH) { data->cd_archive_id = op_data->op_archive_id; + } else if (bias & MDS_CLOSE_LAYOUT_SWAP) { + data->cd_data_version2 = op_data->op_data_version2; } } diff --git a/lustre/mdd/mdd_object.c b/lustre/mdd/mdd_object.c index 07db69a..3222e49 100644 --- a/lustre/mdd/mdd_object.c +++ b/lustre/mdd/mdd_object.c @@ -2289,31 +2289,28 @@ repeat: RETURN(0); } -static int mdd_xattr_hsm_replace(const struct lu_env *env, - struct mdd_object *o, struct lu_buf *buf, - struct thandle *handle) +static int emit_changelog_after_swap_layout(const struct lu_env *env, + struct thandle *handle, + struct mdd_object *o, + struct lu_buf *hsm_buf) { - struct hsm_attrs *attrs; + enum changelog_rec_flags flags = 0; + enum changelog_rec_type type; enum hsm_states hsm_flags; - enum changelog_rec_flags clf_flags = 0; - int rc; - ENTRY; - - rc = mdo_xattr_set(env, o, buf, XATTR_NAME_HSM, LU_XATTR_REPLACE, - handle); - if (rc != 0) - RETURN(rc); + struct hsm_attrs *attrs; - attrs = 
buf->lb_buf; + attrs = hsm_buf->lb_buf; hsm_flags = le32_to_cpu(attrs->hsm_flags); - if (!(hsm_flags & HS_RELEASED) || mdd_is_dead_obj(o)) - RETURN(0); - /* Add a changelog record for release. */ - hsm_set_cl_event(&clf_flags, HE_RELEASE); - rc = mdd_changelog_data_store(env, mdo2mdd(&o->mod_obj), CL_HSM, - clf_flags, o, handle, NULL); - RETURN(rc); + if ((hsm_flags & HS_RELEASED) && !mdd_is_dead_obj(o)) { + hsm_set_cl_event(&flags, HE_RELEASE); + type = CL_HSM; + } else { + type = CL_LAYOUT; + } + + return mdd_changelog_data_store(env, mdo2mdd(&o->mod_obj), type, + flags, o, handle, NULL); } /* @@ -2353,7 +2350,9 @@ static int mdd_layout_swap_allowed(const struct lu_env *env, RETURN(-EBADF); } - if (flags & SWAP_LAYOUTS_MDS_HSM) + /* Do not check uid/gid for release since the orphan is created as root + */ + if (flags & SWAP_LAYOUTS_MDS_RELEASE) RETURN(0); if ((attr1->la_uid != attr2->la_uid) || @@ -2527,11 +2526,200 @@ out: return rc; } +/* Swap Layout HSM object information: every information needed to decide how + * to update the HSM xattr flag during a layout swap. + */ +struct sl_hsm_object_info { + struct mdd_object *o; /* the object whose HSM attribute to check */ + struct lu_buf *hsm_buf; /* buffer initialized with the content of the + * HSM xattr. + */ + int xattr_flags; /* 0: do not update HSM xattr + * LU_XATTR_REPLACE or LU_XATTR_CREATE: update + * the xattr with this flag + */ + __u64 dv; /* dataversion of the object */ +}; + +/* Read the HSM xattr and store its content into \p hsm_buf. 
*/ +static int fetch_hsm_xattr(const struct lu_env *env, struct mdd_object *o, + struct lu_buf *hsm_buf) +{ + int rc; + + lu_buf_alloc(hsm_buf, sizeof(struct hsm_attrs)); + if (hsm_buf->lb_buf == NULL) + return -ENOMEM; + + rc = mdo_xattr_get(env, o, hsm_buf, XATTR_NAME_HSM); + if (rc < 0) + return rc; + + return 0; +} + +/* return true if HSM xattr needs update */ +static bool swap_hsm_set_dirty(struct lu_buf *out_buf, struct lu_buf *src_buf) +{ + struct md_hsm src_hsm; + + lustre_buf2hsm(src_buf->lb_buf, src_buf->lb_len, &src_hsm); + if (!(src_hsm.mh_flags & (HS_ARCHIVED | HS_EXISTS)) || + src_hsm.mh_flags & HS_DIRTY) + /* do not update HSM attr of non archived or dirty files */ + return false; + + src_hsm.mh_flags |= HS_DIRTY; + lustre_hsm2buf(out_buf->lb_buf, &src_hsm); + + return true; +} + +/* return 1 HSM xattr needs update + * 0 do not update HSM xattr + * -EPERM data version mismatch, no swap + */ +static int swap_hsm_update_version(struct lu_buf *out_buf, __u64 out_dv, + struct lu_buf *src_buf, __u64 src_dv) +{ + struct md_hsm src_hsm; + + /* Put lb_len as a second argument since we know that src_buf is a valid + * HSM xattr buffer. + */ + lustre_buf2hsm(src_buf->lb_buf, src_buf->lb_len, &src_hsm); + + if (!(src_hsm.mh_flags & (HS_ARCHIVED | HS_EXISTS)) || + src_hsm.mh_flags & HS_DIRTY) + return 0; + + /* migration with old client -> set the dirty flag */ + if (!src_dv || !out_dv) { + src_hsm.mh_flags |= HS_DIRTY; + goto hsm2buf; + } + + if (src_hsm.mh_arch_ver != src_dv) { + CDEBUG(D_HSM, + "HSM archive version and previous data version mismatch (arch_ver=%llu, prev_ver=%llu, new_ver=%llu)\n", + src_hsm.mh_arch_ver, src_dv, out_dv); + return -EPERM; + } + + src_hsm.mh_arch_ver = out_dv; +hsm2buf: + lustre_hsm2buf(out_buf->lb_buf, &src_hsm); + + return 1; +} + +/* Allow HSM xattr swap only with volatile/orphan files. + * This is used by HSM release/restore. 
+ */ +static inline bool swap_hsm_xattr_allowed(bool fst_has_hsm, bool snd_has_hsm, + unsigned long o_fst_flag, + unsigned long o_snd_flag) +{ + return (fst_has_hsm && snd_has_hsm && + ((o_fst_flag | o_snd_flag) & (ORPHAN_OBJ | VOLATILE_OBJ))); +} + +/* Read and update the data version of both objects if necessary. If they have + * to be updated, declare the update. + * \p xattr_flags will be updated if necessary to be used by mdo_xattr_set + * + * \p dv contains the data version of the file before and after the migration. + * It can be NULL when swapping layouts in other contexts (HSM or + * lfs swap_layout). + */ +static int swap_layouts_prepare_hsm_attr(const struct lu_env *env, + struct mdd_device *mdd, + struct thandle *handle, + struct sl_hsm_object_info *fst, + struct sl_hsm_object_info *snd) +{ + unsigned long o_fst_fl; + unsigned long o_snd_fl; + int rc2; + int rc; + + fst->xattr_flags = 0; + snd->xattr_flags = 0; + + rc = fetch_hsm_xattr(env, fst->o, fst->hsm_buf); + if (rc != -ENODATA && rc != 0) + return rc; + + rc2 = fetch_hsm_xattr(env, snd->o, snd->hsm_buf); + if (rc2 != -ENODATA && rc2 != 0) + return rc2; + + /* if nothing to swap, not an error */ + if (rc && rc2) + return 0; + + /* swap if the first object have no HSM xattr */ + if (rc && !rc2) { + swap(fst, snd); + swap(rc, rc2); + } + + o_fst_fl = fst->o->mod_flags; + o_snd_fl = snd->o->mod_flags; + + if ((o_snd_fl & VOLATILE_OBJ) && rc2) { + /* migration of fst */ + rc = swap_hsm_update_version(snd->hsm_buf, snd->dv, + fst->hsm_buf, fst->dv); + if (rc == 0) + return 0; + if (rc < 0) + return rc; + + fst->xattr_flags = LU_XATTR_REPLACE; + + } else if (swap_hsm_xattr_allowed(!rc, !rc2, o_fst_fl, o_snd_fl)) { + /* HSM release/restore -> HSM xattr swap */ + fst->xattr_flags = LU_XATTR_REPLACE; + snd->xattr_flags = LU_XATTR_REPLACE; + + } else if (!rc && !rc2) { + /* swap on 2 archived files is not supported (no rollback) */ + return -EPERM; + + } else if (rc2) { + /* swap layout with HSM fst and 
non-HSM snd */ + if (!swap_hsm_set_dirty(snd->hsm_buf, fst->hsm_buf)) + return 0; + + fst->xattr_flags = LU_XATTR_REPLACE; + } + + if (fst->xattr_flags) { + rc = mdd_declare_xattr_set(env, mdd, fst->o, snd->hsm_buf, + XATTR_NAME_HSM, fst->xattr_flags, + handle); + if (rc < 0) + return rc; + } + + if (snd->xattr_flags) { + rc = mdd_declare_xattr_set(env, mdd, snd->o, fst->hsm_buf, + XATTR_NAME_HSM, snd->xattr_flags, + handle); + if (rc < 0) + return rc; + } + + return 0; +} + /** * swap layouts between 2 lustre objects */ -static int mdd_swap_layouts(const struct lu_env *env, struct md_object *obj1, - struct md_object *obj2, __u64 flags) +static int mdd_swap_layouts(const struct lu_env *env, + struct md_object *obj1, struct md_object *obj2, + __u64 dv1, __u64 dv2, __u64 flags) { struct mdd_thread_info *info = mdd_env_info(env); struct mdd_object *fst_o = md2mdd_obj(obj1); @@ -2540,18 +2728,25 @@ static int mdd_swap_layouts(const struct lu_env *env, struct md_object *obj1, struct lu_attr *snd_la = MDD_ENV_VAR(env, tattr); struct mdd_device *mdd = mdo2mdd(obj1); struct lov_mds_md *fst_lmm, *snd_lmm; - struct lu_buf *fst_buf; - struct lu_buf *snd_buf; + struct sl_hsm_object_info fst_info; + struct sl_hsm_object_info snd_info; + struct mdd_object *vlt_o = NULL; + struct mdd_object *dom_o = NULL; + struct ost_id *saved_oi = NULL; struct lu_buf *fst_hsm_buf; struct lu_buf *snd_hsm_buf; - struct ost_id *saved_oi = NULL; + struct lu_buf *fst_buf; + struct lu_buf *snd_buf; struct thandle *handle; - struct mdd_object *dom_o = NULL, *vlt_o = NULL; - __u64 domsize_dom, domsize_vlt; - __u32 fst_gen, snd_gen, saved_gen; - int fst_fl; - int rc, rc2; + __u64 domsize_dom; + __u64 domsize_vlt; + __u32 saved_gen; int retried = 0; + __u32 fst_gen; + __u32 snd_gen; + int fst_fl; + int rc2; + int rc; ENTRY; @@ -2570,8 +2765,10 @@ retry: if (rc == 0) /* same fid ? 
*/ RETURN(-EPERM); - if (rc < 0) + if (rc < 0) { swap(fst_o, snd_o); + swap(dv1, dv2); + } rc = mdd_la_get(env, fst_o, fst_la); if (rc != 0) @@ -2652,6 +2849,7 @@ retry: if (snd_buf->lb_buf == NULL) { swap(fst_o, snd_o); swap(fst_buf, snd_buf); + swap(dv1, dv2); } fst_gen = snd_gen = 0; @@ -2710,36 +2908,17 @@ retry: } mdd_set_lmm_gen(snd_lmm, &fst_gen); - /* Prepare HSM attribute if it's required */ - if (flags & SWAP_LAYOUTS_MDS_HSM) { - const int buflen = sizeof(struct hsm_attrs); - - lu_buf_alloc(fst_hsm_buf, buflen); - lu_buf_alloc(snd_hsm_buf, buflen); - if (fst_hsm_buf->lb_buf == NULL || snd_hsm_buf->lb_buf == NULL) - GOTO(stop, rc = -ENOMEM); + fst_info.o = fst_o; + fst_info.hsm_buf = fst_hsm_buf; + fst_info.dv = dv1; - /* Read HSM attribute */ - rc = mdo_xattr_get(env, fst_o, fst_hsm_buf, XATTR_NAME_HSM); - if (rc < 0) - GOTO(stop, rc); - - rc = mdo_xattr_get(env, snd_o, snd_hsm_buf, XATTR_NAME_HSM); - if (rc < 0) - GOTO(stop, rc); - - rc = mdd_declare_xattr_set(env, mdd, fst_o, snd_hsm_buf, - XATTR_NAME_HSM, LU_XATTR_REPLACE, - handle); - if (rc < 0) - GOTO(stop, rc); - - rc = mdd_declare_xattr_set(env, mdd, snd_o, fst_hsm_buf, - XATTR_NAME_HSM, LU_XATTR_REPLACE, - handle); - if (rc < 0) - GOTO(stop, rc); - } + snd_info.o = snd_o; + snd_info.hsm_buf = snd_hsm_buf; + snd_info.dv = dv2; + rc = swap_layouts_prepare_hsm_attr(env, mdd, handle, &fst_info, + &snd_info); + if (rc) + GOTO(stop, rc); /* prepare transaction */ rc = mdd_declare_xattr_set(env, mdd, fst_o, snd_buf, XATTR_NAME_LOV, @@ -2770,26 +2949,25 @@ retry: if (!mdd_object_exists(snd_o)) GOTO(unlock, rc = -ENOENT); - if (flags & SWAP_LAYOUTS_MDS_HSM) { - rc = mdd_xattr_hsm_replace(env, fst_o, snd_hsm_buf, handle); + if (fst_info.xattr_flags) { + rc = mdo_xattr_set(env, fst_o, snd_info.hsm_buf, + XATTR_NAME_HSM, fst_info.xattr_flags, + handle); if (rc < 0) GOTO(unlock, rc); + } - rc = mdd_xattr_hsm_replace(env, snd_o, fst_hsm_buf, handle); - if (rc < 0) { - rc2 = mdd_xattr_hsm_replace(env, fst_o, 
fst_hsm_buf, - handle); - if (rc2 < 0) - CERROR("%s: HSM error restoring "DFID": rc = %d/%d\n", - mdd_obj_dev_name(fst_o), - PFID(mdd_object_fid(fst_o)), rc, rc2); - GOTO(unlock, rc); - } + if (snd_info.xattr_flags) { + rc = mdo_xattr_set(env, snd_o, fst_info.hsm_buf, + XATTR_NAME_HSM, snd_info.xattr_flags, + handle); + if (rc < 0) + GOTO(out_restore_hsm_fst, rc); } rc = mdo_xattr_set(env, fst_o, snd_buf, XATTR_NAME_LOV, fst_fl, handle); if (rc != 0) - GOTO(unlock, rc); + GOTO(out_restore_hsm, rc); if (unlikely(CFS_FAIL_CHECK(OBD_FAIL_MDS_HSM_SWAP_LAYOUTS))) { rc = -EOPNOTSUPP; @@ -2804,13 +2982,11 @@ retry: GOTO(out_restore, rc); /* Issue one changelog record per file */ - rc = mdd_changelog_data_store(env, mdd, CL_LAYOUT, 0, fst_o, handle, - NULL); + rc = emit_changelog_after_swap_layout(env, handle, fst_o, snd_hsm_buf); if (rc) GOTO(unlock, rc); - rc = mdd_changelog_data_store(env, mdd, CL_LAYOUT, 0, snd_o, handle, - NULL); + rc = emit_changelog_after_swap_layout(env, handle, snd_o, fst_hsm_buf); if (rc) GOTO(unlock, rc); EXIT; @@ -2832,16 +3008,23 @@ out_restore: if (rc2 < 0) goto do_lbug; - if (flags & SWAP_LAYOUTS_MDS_HSM) { - ++steps; - rc2 = mdd_xattr_hsm_replace(env, fst_o, fst_hsm_buf, - handle); +out_restore_hsm: + if (snd_info.xattr_flags) { + /* roll back swap HSM */ + steps = 1; + rc2 = mdo_xattr_set(env, snd_o, snd_hsm_buf, + XATTR_NAME_HSM, LU_XATTR_REPLACE, + handle); if (rc2 < 0) goto do_lbug; + } - ++steps; - rc2 = mdd_xattr_hsm_replace(env, snd_o, snd_hsm_buf, - handle); +out_restore_hsm_fst: + if (fst_info.xattr_flags) { + steps = 2; + rc2 = mdo_xattr_set(env, fst_o, fst_hsm_buf, + XATTR_NAME_HSM, LU_XATTR_REPLACE, + handle); } do_lbug: diff --git a/lustre/mdt/mdt_coordinator.c b/lustre/mdt/mdt_coordinator.c index 7410207..ee4dd71 100644 --- a/lustre/mdt/mdt_coordinator.c +++ b/lustre/mdt/mdt_coordinator.c @@ -1451,20 +1451,20 @@ static int hsm_swap_layouts(struct mdt_thread_info *mti, */ mh_common->mh_flags &= ~(HS_RELEASED | HS_DIRTY); rc = 
mdt_hsm_attr_set(mti, dobj, mh_common); - if (rc == 0) - rc = mo_swap_layouts(mti->mti_env, - mdt_object_child(obj), - mdt_object_child(dobj), - SWAP_LAYOUTS_MDS_HSM); - if (rc == 0) { - rc = mdt_lsom_downgrade(mti, obj); - if (rc) - CDEBUG(D_INODE, - "%s: File fid="DFID" SOM " - "downgrade failed, rc = %d\n", - mdt_obd_name(mti->mti_mdt), - PFID(mdt_object_fid(obj)), rc); - } + if (rc) + GOTO(out_dobj, rc); + + rc = mo_swap_layouts(mti->mti_env, mdt_object_child(obj), + mdt_object_child(dobj), 0, 0, 0); + if (rc) + GOTO(out_dobj, rc); + + rc = mdt_lsom_downgrade(mti, obj); + if (rc) + CDEBUG(D_INODE, + "%s: File fid="DFID" SOM downgrade failed, rc = %d\n", + mdt_obd_name(mti->mti_mdt), + PFID(mdt_object_fid(obj)), rc); out_dobj: mdt_object_unlock_put(mti, dobj, dlh, 1); out: diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 19e05d0..ce05eb3 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -1836,7 +1836,7 @@ static int mdt_swap_layouts(struct tgt_session_info *tsi) struct mdt_object *o1, *o2, *o; struct mdt_lock_handle *lh1, *lh2; struct mdc_swap_layouts *msl; - int rc; + int rc; ENTRY; @@ -1904,7 +1904,7 @@ static int mdt_swap_layouts(struct tgt_session_info *tsi) GOTO(unlock1, rc); rc = mo_swap_layouts(info->mti_env, mdt_object_child(o1), - mdt_object_child(o2), msl->msl_flags); + mdt_object_child(o2), 0, 0, msl->msl_flags); if (rc < 0) GOTO(unlock2, rc); diff --git a/lustre/mdt/mdt_lib.c b/lustre/mdt/mdt_lib.c index 3bb864f..13cf30c 100644 --- a/lustre/mdt/mdt_lib.c +++ b/lustre/mdt/mdt_lib.c @@ -1217,7 +1217,7 @@ static int mdt_setattr_unpack_rec(struct mdt_thread_info *info) ma->ma_attr_flags |= rec->sa_bias & (MDS_CLOSE_INTENT | MDS_DATA_MODIFIED | MDS_TRUNC_KEEP_LEASE | - MDS_PCC_ATTACH); + MDS_PCC_ATTACH | MDS_CLOSE_LAYOUT_SWAP_HSM); RETURN(0); } diff --git a/lustre/mdt/mdt_open.c b/lustre/mdt/mdt_open.c index de0cce2..856a91b 100644 --- a/lustre/mdt/mdt_open.c +++ b/lustre/mdt/mdt_open.c @@ -1886,6 +1886,9 @@ 
static int mdt_hsm_release(struct mdt_thread_info *info, struct mdt_object *o, if (data == NULL) RETURN(-EPROTO); + if (req_capsule_req_need_swab(info->mti_pill)) + lustre_swab_close_data_special(data, ma->ma_attr_flags); + lease = ldlm_handle2lock(&data->cd_handle); if (lease == NULL) RETURN(-ESTALE); @@ -2074,8 +2077,8 @@ static int mdt_hsm_release(struct mdt_thread_info *info, struct mdt_object *o, /* Swap layout with orphan objects. */ rc = mo_swap_layouts(info->mti_env, mdt_object_child(o), - mdt_object_child(orphan), - SWAP_LAYOUTS_MDS_HSM); + mdt_object_child(orphan), 0, 0, + SWAP_LAYOUTS_MDS_RELEASE); if (!rc && ma->ma_attr_flags & MDS_PCC_ATTACH) { ma->ma_need = MA_LOV; @@ -2151,6 +2154,9 @@ static int mdt_close_handle_layouts(struct mdt_thread_info *info, if (data == NULL) RETURN(-EPROTO); + if (req_capsule_req_need_swab(info->mti_pill)) + lustre_swab_close_data_special(data, ma->ma_attr_flags); + if (fid_is_zero(&data->cd_fid) || !fid_is_sane(&data->cd_fid)) RETURN(-EINVAL); @@ -2238,8 +2244,22 @@ static int mdt_close_handle_layouts(struct mdt_thread_info *info, /* Swap layout with orphan object */ if (ma->ma_attr_flags & MDS_CLOSE_LAYOUT_SWAP) { + __u64 dv1 = data->cd_data_version; + __u64 dv2 = 0; + + if (ma->ma_attr_flags & MDS_CLOSE_LAYOUT_SWAP_HSM) + /* Compat: new clients send new dataversion in + * cd_data_version2 and old one in cd_data_version. + * Old clients sent cd_data_version = 0 and no + * cd_data_version2. 
+ */ + dv2 = data->cd_data_version2; + + if (swap_objects) + swap(dv1, dv2); + rc = mo_swap_layouts(info->mti_env, mdt_object_child(o1), - mdt_object_child(o2), 0); + mdt_object_child(o2), dv1, dv2, 0); } else if (ma->ma_attr_flags & MDS_CLOSE_LAYOUT_MERGE || ma->ma_attr_flags & MDS_CLOSE_LAYOUT_SPLIT) { struct lu_buf *buf = &info->mti_buf; @@ -2365,7 +2385,7 @@ static int mdt_close_resync_done(struct mdt_thread_info *info, RETURN(-EPROTO); if (req_capsule_req_need_swab(info->mti_pill)) - lustre_swab_close_data_resync_done(&data->cd_resync); + lustre_swab_close_data_special(data, ma->ma_attr_flags); if (!fid_is_zero(&data->cd_fid)) RETURN(-EPROTO); diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 08f92b8..5d491b3 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -3059,6 +3059,8 @@ void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl) void lustre_swab_close_data(struct close_data *cd) { + + __swab64s(&cd->cd_handle.cookie); lustre_swab_lu_fid(&cd->cd_fid); __swab64s(&cd->cd_data_version); } @@ -3074,7 +3076,19 @@ void lustre_swab_close_data_resync_done(struct close_data_resync_done *resync) __swab32s(&resync->resync_ids_inline[i]); } } -EXPORT_SYMBOL(lustre_swab_close_data_resync_done); + +void lustre_swab_close_data_special(struct close_data *cd, enum mds_op_bias b) +{ + if (b & MDS_CLOSE_RESYNC_DONE) + lustre_swab_close_data_resync_done(&cd->cd_resync); + else if (b & MDS_CLOSE_LAYOUT_SPLIT) + __swab16s(&cd->cd_mirror_id); + else if (b & MDS_PCC_ATTACH) + swab32s(&cd->cd_archive_id); + else if (b & MDS_CLOSE_LAYOUT_SWAP) + swab64s(&cd->cd_data_version2); +} +EXPORT_SYMBOL(lustre_swab_close_data_special); void lustre_swab_lfsck_request(struct lfsck_request *lr) { diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index bd0e42f..196fa6e 100755 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -5896,6 +5896,133 @@ test_606() { } run_test 606 "llog_reader groks 
changelog fields" +get_hsm_xattr_sha() +{ + getfattr -e text -n trusted.hsm "$1" 2>/dev/null | + sha1sum | awk '{ print $1 }' +} + +test_hsm_migrate_init() +{ + local d=$1 + local f=$2 + local fid + + mkdir_on_mdt0 "$d" + fid=$(create_small_file "$f") + + echo "$fid" +} + +test_607a() +{ + local d="$DIR/$tdir" + local f="$d/$tfile" + local fid + + (( MDS1_VERSION >= $(version_code 2.15.60) )) || + skip "need MDS version at least 2.15.60" + + (( OSTCOUNT >= 2 )) || skip_env "needs >= 2 OSTs" + + fid=$(test_hsm_migrate_init "$d" "$f" | tail -1) + + copytool setup + + $LFS hsm_archive "$f" || error "could not archive file" + wait_request_state $fid ARCHIVE SUCCEED + + $LFS migrate -n -i 1 "$f" || + error "could not migrate file to OST 1" + + $LFS hsm_release "$f" || + error "could not release file after non blocking migrate" + $LFS hsm_restore "$f" || + error "could not restore file after non blocking migrate" + wait_request_state $fid RESTORE SUCCEED +} +run_test 607a "release a file that was migrated after being archived" + +test_607b() +{ + local d="$DIR/$tdir" + local f="$DIR/$tdir/$tfile" + local saved_params + local old_hsm + local new_hsm + local fid + + (( MDS1_VERSION >= $(version_code 2.15.60) )) || + skip "need MDS version at least 2.15.60" + + (( OSTCOUNT >= 2 )) || skip_env "needs >= 2 OSTs" + + fid=$(test_hsm_migrate_init "$d" "$f" | tail -1) + + copytool setup + + $LFS hsm_archive "$f" || error "could not archive file" + wait_request_state $fid ARCHIVE SUCCEED + + saved_params=$($LCTL get_param llite.*.xattr_cache | tr '\n' ' ') + $LCTL set_param llite.*.xattr_cache=0 + stack_trap "$LCTL set_param $saved_params" EXIT + + # make sure that migrate won't change archive version + echo 10 >> "$f" + + old_hsm=$(get_hsm_xattr_sha "$f") + $LFS migrate -n -i 1 "$f" || + error "could not migrate file to OST 1" + + $LFS hsm_state "$f" | grep dirty || error "dirty flag not found" + + new_hsm=$(get_hsm_xattr_sha "$f") + [ "$old_hsm" != "$new_hsm" ] && + error 
"migrate should not modify data version of dirty files" + + return 0 +} +run_test 607b "Migrate should not change the HSM attribute of dirty files" + +test_607c() +{ + local d="$DIR/$tdir" + local f="$DIR/$tdir/$tfile" + local fid1 fid2 fid3 + local nbr_dirty + + (( MDS1_VERSION >= $(version_code 2.15.60) )) || + skip "need MDS version at least 2.15.60" + + mkdir_on_mdt0 $d + fid1=$(create_small_file "$f-1") + fid2=$(create_small_file "$f-2") + fid3=$(create_small_file "$f-3") + + copytool setup + + $LFS hsm_archive "$f-1" || error "could not archive file" + wait_request_state $fid1 ARCHIVE SUCCEED + + $LFS hsm_archive "$f-3" || error "could not archive file" + wait_request_state $fid3 ARCHIVE SUCCEED + + $LFS swap_layouts "$f-1" "$f-3" |& grep "Operation not permitted" || + error "swap_layouts should fail with EPERM on 2 archived file" + + $LFS swap_layouts "$f-1" "$f-2" || + error "swap_layout failed on $f-1 and $f-2" + + $LFS swap_layouts "$f-2" "$f-3" || + error "swap_layout failed on $f-2 and $f-3" + + nbr_dirty=$($LFS hsm_state "$f-1" "$f-3" | grep -c 'dirty') + ((nbr_dirty == 2)) || error "dirty flag should be set on $f-1 and $f-3" + +} +run_test 607c "'lfs swap_layouts' should set dirty flag on HSM file" + complete_test $SECONDS check_and_cleanup_lustre exit_status diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c index ae0d049..db45753 100644 --- a/lustre/utils/lfs.c +++ b/lustre/utils/lfs.c @@ -1092,12 +1092,13 @@ static int check_lease(int fd) static int migrate_nonblock(int fd_src, int fd_dst, unsigned long long bandwidth_bytes_sec, - long stats_interval_sec) + long stats_interval_sec, + __u64 *dv_src) { struct stat st; - __u64 dv1; - __u64 dv2; - int rc; + __u64 dv1; + __u64 dv2; + int rc; rc = fstat(fd_src, &st); if (rc < 0) { @@ -1125,6 +1126,9 @@ static int migrate_nonblock(int fd_src, int fd_dst, return rc; } + if (dv_src) + *dv_src = dv2; + if (dv1 != dv2) { rc = -EAGAIN; error_loc = "source file changed"; @@ -1324,6 +1328,8 @@ static int 
lfs_migrate(char *name, __u64 migration_flags, { struct llapi_layout *existing; uint64_t dom_new, dom_cur; + __u64 dv_src = 0; + __u64 dv_dst = 0; int fd_src = -1; int fd_dst = -1; int rc; @@ -1388,18 +1394,24 @@ static int lfs_migrate(char *name, __u64 migration_flags, } rc = migrate_nonblock(fd_src, fd_dst, bandwidth_bytes_sec, - stats_interval_sec); + stats_interval_sec, &dv_src); if (rc < 0) { llapi_lease_release(fd_src); goto out; } + rc = llapi_get_data_version(fd_dst, &dv_dst, LL_DV_RD_FLUSH); + if (rc != 0) { + error_loc = "cannot get data version"; + goto out; + } /* * Atomically put lease, swap layouts and close. * for a migration we need to check data version on file did * not change. */ - rc = llapi_fswap_layouts(fd_src, fd_dst, 0, 0, SWAP_LAYOUTS_CLOSE); + rc = llapi_fswap_layouts(fd_src, fd_dst, dv_src, dv_dst, + SWAP_LAYOUTS_CLOSE); if (rc < 0) { error_loc = "cannot swap layout"; goto out; @@ -1930,9 +1942,9 @@ static int mirror_extend_layout(char *name, struct llapi_layout *m_layout, { struct llapi_layout *f_layout = NULL; struct ll_ioc_lease *data = NULL; - struct stat st; int fd_src = -1; int fd_dst = -1; + struct stat st; int rc = 0; if (inherit) { @@ -1980,7 +1992,7 @@ static int mirror_extend_layout(char *name, struct llapi_layout *m_layout, printf("%s:\n", name); rc = migrate_nonblock(fd_src, fd_dst, bandwidth_bytes_sec, - stats_interval_sec); + stats_interval_sec, NULL); if (rc < 0) { llapi_lease_release(fd_src); goto out; @@ -2505,7 +2517,7 @@ static int lfs_migrate_to_dom(int fd_src, int fd_dst, char *name, printf("%s:\n", name); rc = migrate_nonblock(fd_src, fd_dst, bandwidth_bytes_sec, - stats_interval_sec); + stats_interval_sec, NULL); if (rc < 0) goto out_release; -- 1.8.3.1