From: jcl Date: Thu, 4 Oct 2012 19:51:49 +0000 (+0200) Subject: LU-2016 mdd: add layout swap between 2 objects X-Git-Tag: 2.3.58~46 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=c474bf9f9b9888f6056ea1832edb6a73a7607645 LU-2016 mdd: add layout swap between 2 objects This patch add a new method in mdd to swap the layouts between 2 lustre objects. The 2 objects have to be of the same type. Change-Id: I26dfef2745eac67168aeceac196453c5126148fd Signed-off-by: JC Lafoucriere Reviewed-on: http://review.whamcloud.com/4189 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Johann Lombardi Reviewed-by: Andreas Dilger --- diff --git a/lustre/include/md_object.h b/lustre/include/md_object.h index ce8f875..e907bf9 100644 --- a/lustre/include/md_object.h +++ b/lustre/include/md_object.h @@ -231,6 +231,11 @@ struct md_object_operations { int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj, const char *name); + /** This method is used to swap the layouts between 2 objects */ + int (*moo_swap_layouts)(const struct lu_env *env, + struct md_object *obj1, struct md_object *obj2, + __u64 flags); + /** \retval number of bytes actually read upon success */ int (*moo_readpage)(const struct lu_env *env, struct md_object *obj, const struct lu_rdpg *rdpg); @@ -599,6 +604,17 @@ static inline int mo_xattr_list(const struct lu_env *env, return m->mo_ops->moo_xattr_list(env, m, buf); } +static inline int mo_swap_layouts(const struct lu_env *env, + struct md_object *o1, + struct md_object *o2, __u64 flags) +{ + LASSERT(o1->mo_ops->moo_swap_layouts); + LASSERT(o2->mo_ops->moo_swap_layouts); + if (o1->mo_ops->moo_swap_layouts != o2->mo_ops->moo_swap_layouts) + return -EPERM; + return o1->mo_ops->moo_swap_layouts(env, o1, o2, flags); +} + static inline int mo_open(const struct lu_env *env, struct md_object *m, int flags) diff --git a/lustre/lod/lod_object.c b/lustre/lod/lod_object.c index ae344c5..95d1cf8 100644 --- a/lustre/lod/lod_object.c +++ b/lustre/lod/lod_object.c @@ -422,9 +422,11 @@ static int lod_declare_xattr_set(const struct lu_env *env, * allow to declare predefined striping on a new (!mode) object * which is supposed to be replay of regular file creation * (when LOV setting is declared) + * LU_XATTR_REPLACE is set to indicate a layout swap */ mode = dt->do_lu.lo_header->loh_attr & S_IFMT; - if ((S_ISREG(mode) || !mode) && !strcmp(name, XATTR_NAME_LOV)) { + if ((S_ISREG(mode) || !mode) && !strcmp(name, XATTR_NAME_LOV) && + !(fl & LU_XATTR_REPLACE)) { /* * this is a request to manipulate object's striping */ @@ -509,9 +511,9 @@ static int lod_xattr_set(const struct lu_env *env, const char *name, int fl, struct thandle *th, struct lustre_capa *capa) { - struct dt_object *next = dt_object_child(dt); - __u32 attr; - int rc; + struct dt_object *next = dt_object_child(dt); + __u32 attr; + int rc; ENTRY; attr = dt->do_lu.lo_header->loh_attr & S_IFMT; @@ -523,12 +525,15 @@ static int lod_xattr_set(const struct lu_env *env, rc = dt_xattr_set(env, next, buf, name, fl, th, capa); } else if (S_ISREG(attr) && !strcmp(name, XATTR_NAME_LOV)) { - /* - * XXX: check striping match what we already have - * during req replay, declare_xattr_set() defines striping, - * then create() does the work - */ - rc = lod_striping_create(env, dt, NULL, NULL, th); + /* in case of lov EA swap, just set it + * if not, it is a replay so check striping match what we + * already have during req replay, declare_xattr_set() + * defines striping, then create() does the work + */ + if (fl & LU_XATTR_REPLACE) + rc = dt_xattr_set(env, next, buf, name, fl, th, capa); + else + rc = lod_striping_create(env, dt, NULL, NULL, th); RETURN(rc); } else { /* diff --git a/lustre/mdd/mdd_device.c b/lustre/mdd/mdd_device.c index ebefc2f..2670c1f 100644 --- a/lustre/mdd/mdd_device.c +++ b/lustre/mdd/mdd_device.c @@ -637,6 +637,14 @@ static int dot_lustre_mdd_xattr_del(const struct lu_env *env, return -EPERM; } +static int dot_lustre_mdd_swap_layouts(const struct lu_env *env, + struct md_object *obj1, + struct md_object *obj2, + __u64 flags) +{ + return -EPERM; +} + static int dot_lustre_mdd_readlink(const struct lu_env *env, struct md_object *obj, struct lu_buf *buf) { @@ -715,25 +723,26 @@ static int dot_file_unlock(const struct lu_env *env, struct md_object *obj, } static struct md_object_operations mdd_dot_lustre_obj_ops = { - .moo_permission = dot_lustre_mdd_permission, - .moo_attr_get = mdd_attr_get, - .moo_attr_set = mdd_attr_set, - .moo_xattr_get = dot_lustre_mdd_xattr_get, - .moo_xattr_list = dot_lustre_mdd_xattr_list, - .moo_xattr_set = dot_lustre_mdd_xattr_set, - .moo_xattr_del = dot_lustre_mdd_xattr_del, - .moo_readpage = mdd_readpage, - .moo_readlink = dot_lustre_mdd_readlink, - .moo_object_create = dot_lustre_mdd_object_create, - .moo_ref_add = dot_lustre_mdd_ref_add, - .moo_ref_del = dot_lustre_mdd_ref_del, - .moo_open = dot_lustre_mdd_open, - .moo_close = dot_lustre_mdd_close, - .moo_capa_get = mdd_capa_get, - .moo_object_sync = dot_lustre_mdd_object_sync, - .moo_path = dot_lustre_mdd_path, - .moo_file_lock = dot_file_lock, - .moo_file_unlock = dot_file_unlock, + .moo_permission = dot_lustre_mdd_permission, + .moo_attr_get = mdd_attr_get, + .moo_attr_set = mdd_attr_set, + .moo_xattr_get = dot_lustre_mdd_xattr_get, + .moo_xattr_list = dot_lustre_mdd_xattr_list, + .moo_xattr_set = dot_lustre_mdd_xattr_set, + .moo_xattr_del = dot_lustre_mdd_xattr_del, + .moo_swap_layouts = dot_lustre_mdd_swap_layouts, + .moo_readpage = mdd_readpage, + .moo_readlink = dot_lustre_mdd_readlink, + .moo_object_create = dot_lustre_mdd_object_create, + .moo_ref_add = dot_lustre_mdd_ref_add, + .moo_ref_del = dot_lustre_mdd_ref_del, + .moo_open = dot_lustre_mdd_open, + .moo_close = dot_lustre_mdd_close, + .moo_capa_get = mdd_capa_get, + .moo_object_sync = dot_lustre_mdd_object_sync, + .moo_path = dot_lustre_mdd_path, + .moo_file_lock = dot_file_lock, + .moo_file_unlock = dot_file_unlock, }; diff --git a/lustre/mdd/mdd_object.c b/lustre/mdd/mdd_object.c index d4eff75..9c3b9e4 100644 --- a/lustre/mdd/mdd_object.c +++ b/lustre/mdd/mdd_object.c @@ -1181,24 +1181,24 @@ static int mdd_xattr_sanity_check(const struct lu_env *env, } static int mdd_declare_xattr_set(const struct lu_env *env, - struct mdd_device *mdd, - struct mdd_object *obj, - const struct lu_buf *buf, - const char *name, - struct thandle *handle) + struct mdd_device *mdd, + struct mdd_object *obj, + const struct lu_buf *buf, + const char *name, + int fl, struct thandle *handle) { - int rc; + int rc; - rc = mdo_declare_xattr_set(env, obj, buf, name, 0, handle); - if (rc) - return rc; + rc = mdo_declare_xattr_set(env, obj, buf, name, fl, handle); + if (rc) + return rc; - /* Only record user xattr changes */ - if ((strncmp("user.", name, 5) == 0)) - rc = mdd_declare_changelog_store(env, mdd, NULL, handle); + /* Only record user xattr changes */ + if ((strncmp("user.", name, 5) == 0)) + rc = mdd_declare_changelog_store(env, mdd, NULL, handle); rc = mdd_declare_changelog_store(env, mdd, NULL, handle); - return rc; + return rc; } /** @@ -1206,13 +1206,13 @@ static int mdd_declare_xattr_set(const struct lu_env *env, * after xattr_set if needed. */ static int mdd_xattr_set(const struct lu_env *env, struct md_object *obj, - const struct lu_buf *buf, const char *name, - int fl) + const struct lu_buf *buf, const char *name, + int fl) { - struct mdd_object *mdd_obj = md2mdd_obj(obj); - struct mdd_device *mdd = mdo2mdd(obj); - struct thandle *handle; - int rc; + struct mdd_object *mdd_obj = md2mdd_obj(obj); + struct mdd_device *mdd = mdo2mdd(obj); + struct thandle *handle; + int rc; ENTRY; if (!strcmp(name, XATTR_NAME_ACL_ACCESS)) { @@ -1220,25 +1220,25 @@ static int mdd_xattr_set(const struct lu_env *env, struct md_object *obj, RETURN(rc); } - rc = mdd_xattr_sanity_check(env, mdd_obj); - if (rc) - RETURN(rc); + rc = mdd_xattr_sanity_check(env, mdd_obj); + if (rc) + RETURN(rc); - handle = mdd_trans_create(env, mdd); - if (IS_ERR(handle)) - RETURN(PTR_ERR(handle)); + handle = mdd_trans_create(env, mdd); + if (IS_ERR(handle)) + RETURN(PTR_ERR(handle)); - rc = mdd_declare_xattr_set(env, mdd, mdd_obj, buf, name, handle); - if (rc) - GOTO(stop, rc); + rc = mdd_declare_xattr_set(env, mdd, mdd_obj, buf, name, 0, handle); + if (rc) + GOTO(stop, rc); - rc = mdd_trans_start(env, mdd, handle); - if (rc) - GOTO(stop, rc); + rc = mdd_trans_start(env, mdd, handle); + if (rc) + GOTO(stop, rc); - /* security-replated changes may require sync */ - if (!strcmp(name, XATTR_NAME_ACL_ACCESS)) - handle->th_sync |= !!mdd->mdd_sync_permission; + /* security-replated changes may require sync */ + if (!strcmp(name, XATTR_NAME_ACL_ACCESS)) + handle->th_sync |= !!mdd->mdd_sync_permission; mdd_write_lock(env, mdd_obj, MOR_TGT_CHILD); rc = mdo_xattr_set(env, mdd_obj, buf, name, fl, handle, @@ -1247,20 +1247,20 @@ static int mdd_xattr_set(const struct lu_env *env, struct md_object *obj, if (rc) GOTO(stop, rc); - /* Only record system & user xattr changes */ + /* Only record system & user xattr changes */ if (strncmp(XATTR_USER_PREFIX, name, - sizeof(XATTR_USER_PREFIX) - 1) == 0 || - strncmp(POSIX_ACL_XATTR_ACCESS, name, - sizeof(POSIX_ACL_XATTR_ACCESS) - 1) == 0 || - strncmp(POSIX_ACL_XATTR_DEFAULT, name, - sizeof(POSIX_ACL_XATTR_DEFAULT) - 1) == 0) - rc = mdd_changelog_data_store(env, mdd, CL_XATTR, 0, mdd_obj, - handle); + sizeof(XATTR_USER_PREFIX) - 1) == 0 || + strncmp(POSIX_ACL_XATTR_ACCESS, name, + sizeof(POSIX_ACL_XATTR_ACCESS) - 1) == 0 || + strncmp(POSIX_ACL_XATTR_DEFAULT, name, + sizeof(POSIX_ACL_XATTR_DEFAULT) - 1) == 0) + rc = mdd_changelog_data_store(env, mdd, CL_XATTR, 0, mdd_obj, + handle); stop: - mdd_trans_stop(env, mdd, rc, handle); + mdd_trans_stop(env, mdd, rc, handle); - RETURN(rc); + RETURN(rc); } static int mdd_declare_xattr_del(const struct lu_env *env, @@ -1334,6 +1334,302 @@ stop: RETURN(rc); } +/* + * read lov EA of an object + * return the lov EA in an allocated lu_buf + */ +static struct lu_buf *mdd_get_lov_ea(const struct lu_env *env, + struct mdd_object *obj) +{ + struct lu_buf *buf = &mdd_env_info(env)->mti_big_buf; + struct lu_buf *lmm_buf = NULL; + int rc, sz; + ENTRY; + +repeat: + rc = mdo_xattr_get(env, obj, buf, XATTR_NAME_LOV, + mdd_object_capa(env, obj)); + if (rc < 0) + GOTO(out, rc); + + if (rc == 0) + GOTO(out, rc = -ENODATA); + + sz = rc; + if (memcmp(buf, &LU_BUF_NULL, sizeof(*buf)) == 0) { + /* mti_big_buf was not allocated, so we have to + * allocate it based on the ea size */ + buf = mdd_buf_alloc(env, sz); + if (buf->lb_buf == NULL) + GOTO(out, rc = -ENOMEM); + goto repeat; + } + + OBD_ALLOC_PTR(lmm_buf); + if (!lmm_buf) + GOTO(out, rc = -ENOMEM); + + OBD_ALLOC(lmm_buf->lb_buf, sz); + if (!lmm_buf->lb_buf) + GOTO(free, rc = -ENOMEM); + + memcpy(lmm_buf->lb_buf, buf->lb_buf, sz); + lmm_buf->lb_len = sz; + + GOTO(out, rc = 0); + +free: + if (lmm_buf) + OBD_FREE_PTR(lmm_buf); +out: + if (rc) + return ERR_PTR(rc); + return lmm_buf; +} + + +/* + * check if layout swapping between 2 objects is allowed + * the rules are: + * - same type of objects + * - same owner/group (so quotas are still valid) + */ +static int mdd_layout_swap_allowed(const struct lu_env *env, + struct mdd_object *o1, + struct mdd_object *o2) +{ + const struct lu_fid *fid1, *fid2; + __u32 uid, gid; + struct lu_attr *tmp_la = &mdd_env_info(env)->mti_la; + int rc; + ENTRY; + + fid1 = mdo2fid(o1); + fid2 = mdo2fid(o2); + + if (!fid_is_norm(fid1) || !fid_is_norm(fid2) || + (mdd_object_type(o1) != mdd_object_type(o2))) + RETURN(-EPERM); + + tmp_la->la_valid = 0; + rc = mdd_la_get(env, o1, tmp_la, BYPASS_CAPA); + if (rc) + RETURN(rc); + uid = tmp_la->la_uid; + gid = tmp_la->la_gid; + + tmp_la->la_valid = 0; + rc = mdd_la_get(env, o2, tmp_la, BYPASS_CAPA); + if (rc) + RETURN(rc); + + if ((uid != tmp_la->la_uid) || (gid != tmp_la->la_gid)) + RETURN(-EPERM); + + RETURN(0); +} + +/** + * swap layouts between 2 lustre objects + */ +static int mdd_swap_layouts(const struct lu_env *env, struct md_object *obj1, + struct md_object *obj2, __u64 flags) +{ + struct mdd_object *o1, *o2, *fst_o, *snd_o; + struct lu_buf *lmm1_buf = NULL, *lmm2_buf = NULL; + struct lu_buf *fst_buf, *snd_buf; + struct lov_mds_md *fst_lmm, *snd_lmm, *old_fst_lmm = NULL; + struct thandle *handle; + struct mdd_device *mdd = mdo2mdd(obj1); + int rc; + __u16 fst_gen, snd_gen; + ENTRY; + + /* we have to sort the 2 obj, so locking will always + * be in the same order, even in case of 2 concurrent swaps */ + rc = lu_fid_cmp(mdo2fid(md2mdd_obj(obj1)), + mdo2fid(md2mdd_obj(obj2))); + /* same fid ? */ + if (rc == 0) + RETURN(-EPERM); + + if (rc > 0) { + o1 = md2mdd_obj(obj1); + o2 = md2mdd_obj(obj2); + } else { + o1 = md2mdd_obj(obj2); + o2 = md2mdd_obj(obj1); + } + + /* check if layout swapping is allowed */ + rc = mdd_layout_swap_allowed(env, o1, o2); + if (rc) + RETURN(rc); + + handle = mdd_trans_create(env, mdd); + if (IS_ERR(handle)) + RETURN(PTR_ERR(handle)); + + /* objects are already sorted */ + mdd_write_lock(env, o1, MOR_TGT_CHILD); + mdd_write_lock(env, o2, MOR_TGT_CHILD); + + lmm1_buf = mdd_get_lov_ea(env, o1); + if (IS_ERR(lmm1_buf)) { + rc = PTR_ERR(lmm1_buf); + lmm1_buf = NULL; + if (rc != -ENODATA) + GOTO(unlock, rc); + } + + lmm2_buf = mdd_get_lov_ea(env, o2); + if (IS_ERR(lmm2_buf)) { + rc = PTR_ERR(lmm2_buf); + lmm2_buf = NULL; + if (rc != -ENODATA) + GOTO(unlock, rc); + } + + /* swapping 2 non existant layouts is a success */ + if ((lmm1_buf == NULL) && (lmm2_buf == NULL)) + GOTO(unlock, rc = 0); + + /* to help inode migration between MDT, it is better to + * start by the no layout file (if one), so we order the swap */ + if (lmm1_buf == NULL) { + fst_o = o1; + fst_buf = lmm1_buf; + snd_o = o2; + snd_buf = lmm2_buf; + } else { + fst_o = o2; + fst_buf = lmm2_buf; + snd_o = o1; + snd_buf = lmm1_buf; + } + + /* lmm and generation layout initialization */ + if (fst_buf) { + fst_lmm = fst_buf->lb_buf; + fst_gen = le16_to_cpu(fst_lmm->lmm_layout_gen); + } else { + fst_lmm = NULL; + fst_gen = 0; + } + + if (snd_buf) { + snd_lmm = snd_buf->lb_buf; + snd_gen = le16_to_cpu(snd_lmm->lmm_layout_gen); + } else { + snd_lmm = NULL; + snd_gen = 0; + } + + /* save the orignal lmm common header of first file + * to be able to roll back */ + OBD_ALLOC_PTR(old_fst_lmm); + if (old_fst_lmm == NULL) + GOTO(unlock, rc = -ENOMEM); + + memcpy(old_fst_lmm, fst_lmm, sizeof(*old_fst_lmm)); + + /* increase the generation layout numbers */ + snd_gen++; + fst_gen++; + + /* set the file specific informations in lmm */ + if (fst_lmm) { + fst_lmm->lmm_layout_gen = cpu_to_le16(snd_gen); + fst_lmm->lmm_object_seq = snd_lmm->lmm_object_seq; + fst_lmm->lmm_object_id = snd_lmm->lmm_object_id; + } + + if (snd_lmm) { + snd_lmm->lmm_layout_gen = cpu_to_le16(fst_gen); + snd_lmm->lmm_object_seq = old_fst_lmm->lmm_object_seq; + snd_lmm->lmm_object_id = old_fst_lmm->lmm_object_id; + } + + /* prepare transaction */ + rc = mdd_declare_xattr_set(env, mdd, fst_o, snd_buf, XATTR_NAME_LOV, + LU_XATTR_REPLACE, handle); + if (rc) + GOTO(stop, rc); + + rc = mdd_declare_xattr_set(env, mdd, snd_o, fst_buf, XATTR_NAME_LOV, + LU_XATTR_REPLACE, handle); + if (rc) + GOTO(stop, rc); + + rc = mdd_trans_start(env, mdd, handle); + if (rc) + GOTO(stop, rc); + + rc = mdo_xattr_set(env, fst_o, snd_buf, XATTR_NAME_LOV, + LU_XATTR_REPLACE, handle, + mdd_object_capa(env, fst_o)); + if (rc) + GOTO(stop, rc); + + rc = mdo_xattr_set(env, snd_o, fst_buf, XATTR_NAME_LOV, + LU_XATTR_REPLACE, handle, + mdd_object_capa(env, snd_o)); + if (rc) { + int rc2; + + /* failure on second file, but first was done, so we have + * to roll back first */ + /* restore object_id, object_seq and generation number + * on first file */ + if (fst_lmm) { + fst_lmm->lmm_object_id = old_fst_lmm->lmm_object_id; + fst_lmm->lmm_object_seq = old_fst_lmm->lmm_object_seq; + fst_lmm->lmm_layout_gen = old_fst_lmm->lmm_layout_gen; + } + + rc2 = mdo_xattr_set(env, fst_o, fst_buf, XATTR_NAME_LOV, + LU_XATTR_REPLACE, handle, + mdd_object_capa(env, fst_o)); + if (rc2) { + /* very bad day */ + CERROR("%s: unable to roll back after swap layouts" + " failure between "DFID" and "DFID + " rc2 = %d rc = %d)\n", + mdd2obd_dev(mdd)->obd_name, + PFID(mdo2fid(snd_o)), PFID(mdo2fid(fst_o)), + rc2, rc); + /* a solution to avoid journal commit is to panic, + * but it has strong consequences so we use LBUG to + * allow sysdamin to choose to panic or not + */ + LBUG(); + } + GOTO(stop, rc); + } + EXIT; + +stop: + mdd_trans_stop(env, mdd, rc, handle); +unlock: + mdd_write_unlock(env, o2); + mdd_write_unlock(env, o1); + + if (lmm1_buf && lmm1_buf->lb_buf) + OBD_FREE(lmm1_buf->lb_buf, lmm1_buf->lb_len); + if (lmm1_buf) + OBD_FREE_PTR(lmm1_buf); + + if (lmm2_buf && lmm2_buf->lb_buf) + OBD_FREE(lmm2_buf->lb_buf, lmm2_buf->lb_len); + if (lmm2_buf) + OBD_FREE_PTR(lmm2_buf); + + if (old_fst_lmm) + OBD_FREE_PTR(old_fst_lmm); + + return rc; +} + void mdd_object_make_hint(const struct lu_env *env, struct mdd_object *parent, struct mdd_object *child, struct lu_attr *attr) { @@ -1793,19 +2089,20 @@ static int mdd_object_sync(const struct lu_env *env, struct md_object *obj) } const struct md_object_operations mdd_obj_ops = { - .moo_permission = mdd_permission, - .moo_attr_get = mdd_attr_get, - .moo_attr_set = mdd_attr_set, - .moo_xattr_get = mdd_xattr_get, - .moo_xattr_set = mdd_xattr_set, - .moo_xattr_list = mdd_xattr_list, - .moo_xattr_del = mdd_xattr_del, - .moo_open = mdd_open, - .moo_close = mdd_close, - .moo_readpage = mdd_readpage, - .moo_readlink = mdd_readlink, - .moo_changelog = mdd_changelog, - .moo_capa_get = mdd_capa_get, - .moo_object_sync = mdd_object_sync, - .moo_path = mdd_path, + .moo_permission = mdd_permission, + .moo_attr_get = mdd_attr_get, + .moo_attr_set = mdd_attr_set, + .moo_xattr_get = mdd_xattr_get, + .moo_xattr_set = mdd_xattr_set, + .moo_xattr_list = mdd_xattr_list, + .moo_xattr_del = mdd_xattr_del, + .moo_swap_layouts = mdd_swap_layouts, + .moo_open = mdd_open, + .moo_close = mdd_close, + .moo_readpage = mdd_readpage, + .moo_readlink = mdd_readlink, + .moo_changelog = mdd_changelog, + .moo_capa_get = mdd_capa_get, + .moo_object_sync = mdd_object_sync, + .moo_path = mdd_path, };