From ead6f5b2b5d3e151b4f98404bb0d253ccb992d6a Mon Sep 17 00:00:00 2001 From: jcl Date: Wed, 6 Mar 2013 22:40:42 +0100 Subject: [PATCH] LU-2445 utils: lfs migrate support Add a new command, migrate, to the lfs utility. migrate takes the same args as setstripe plus a --block option. It: - creates a volatile file based on the new stripe information - copies the file data to this volatile file - swaps the 2 layouts - if --block is set, takes the group lock to block other access during the copy So after migrate the file is restriped (mtime and atime are not changed). lfs_migrate is also updated to use the new call and keeps the rsync mode as a fallback. Signed-off-by: JC Lafoucriere Change-Id: I96bafb0be9bc273295c4c900c65b4028864fcbaa Reviewed-on: http://review.whamcloud.com/5620 Tested-by: Hudson Reviewed-by: Jinshan Xiong Tested-by: Maloo Reviewed-by: Andreas Dilger --- lustre/include/lustre/lustre_user.h | 7 + lustre/include/lustre/lustreapi.h | 6 +- lustre/llite/file.c | 153 +++++++++++--- lustre/lmv/lmv_obd.c | 11 +- lustre/mdd/mdd_object.c | 15 +- lustre/mdt/mdt_handler.c | 3 +- lustre/scripts/lfs_migrate | 16 +- lustre/tests/sanity.sh | 97 +++++---- lustre/utils/lfs.c | 400 +++++++++++++++++++++++++++++------- lustre/utils/liblustreapi.c | 11 +- 10 files changed, 562 insertions(+), 157 deletions(-) diff --git a/lustre/include/lustre/lustre_user.h b/lustre/include/lustre/lustre_user.h index bd03b57..ef02f90 100644 --- a/lustre/include/lustre/lustre_user.h +++ b/lustre/include/lustre/lustre_user.h @@ -572,10 +572,17 @@ struct if_quotactl { struct obd_uuid obd_uuid; }; +/* swap layout flags */ +#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0) +#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1) +#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2) +#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) struct lustre_swap_layouts { __u64 sl_flags; __u32 sl_fd; __u32 sl_gid; + __u64 sl_dv1; + __u64 sl_dv2; }; diff --git a/lustre/include/lustre/lustreapi.h b/lustre/include/lustre/lustreapi.h index 22cf4ce..c500965 100644 --- 
a/lustre/include/lustre/lustreapi.h +++ b/lustre/include/lustre/lustreapi.h @@ -251,8 +251,10 @@ static inline int llapi_create_volatile(char *directory, int mode) } -extern int llapi_fswap_layouts(const int fd1, const int fd2); -extern int llapi_swap_layouts(const char *path1, const char *path2); +extern int llapi_fswap_layouts(const int fd1, const int fd2, + __u64 dv1, __u64 dv2, __u64 flags); +extern int llapi_swap_layouts(const char *path1, const char *path2, + __u64 dv1, __u64 dv2, __u64 flags); /* Changelog interface. priv is private state, managed internally by these functions */ diff --git a/lustre/llite/file.c b/lustre/llite/file.c index a6dee27..36e3cf6 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -1855,65 +1855,154 @@ int ll_data_version(struct inode *inode, __u64 *data_version, RETURN(rc); } -static int ll_swap_layout(struct file *file, struct file *file2, - struct lustre_swap_layouts *lsl) -{ - struct mdc_swap_layouts msl = { .msl_flags = lsl->sl_flags }; - struct md_op_data *op_data; - struct inode *inode = file->f_dentry->d_inode; - struct inode *inode2 = file2->f_dentry->d_inode; - __u32 gid; - int rc; +struct ll_swap_stack { + struct iattr ia1, ia2; + __u64 dv1, dv2; + struct inode *inode1, *inode2; + bool check_dv1, check_dv2; +}; - if (!S_ISREG(inode2->i_mode)) - RETURN(-EINVAL); +static int ll_swap_layouts(struct file *file1, struct file *file2, + struct lustre_swap_layouts *lsl) +{ + struct mdc_swap_layouts msl; + struct md_op_data *op_data; + __u32 gid; + __u64 dv; + struct ll_swap_stack *llss = NULL; + int rc, rc1; - if (ll_permission(inode, MAY_WRITE, NULL) || - ll_permission(inode2, MAY_WRITE, NULL)) - RETURN(-EPERM); + OBD_ALLOC_PTR(llss); + if (llss == NULL) + RETURN(-ENOMEM); + + llss->inode1 = file1->f_dentry->d_inode; + llss->inode2 = file2->f_dentry->d_inode; - if (inode2->i_sb != inode->i_sb) - RETURN(-EXDEV); + if (!S_ISREG(llss->inode2->i_mode)) + GOTO(free, rc = -EINVAL); - rc = lu_fid_cmp(ll_inode2fid(inode), 
ll_inode2fid(inode2)); + if (ll_permission(llss->inode1, MAY_WRITE, NULL) || + ll_permission(llss->inode2, MAY_WRITE, NULL)) + GOTO(free, rc = -EPERM); + + if (llss->inode2->i_sb != llss->inode1->i_sb) + GOTO(free, rc = -EXDEV); + + /* we use 2 bool because it is easier to swap than 2 bits */ + if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1) + llss->check_dv1 = true; + + if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2) + llss->check_dv2 = true; + + /* we cannot use lsl->sl_dvX directly because we may swap them */ + llss->dv1 = lsl->sl_dv1; + llss->dv2 = lsl->sl_dv2; + + rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2)); if (rc == 0) /* same file, done! */ - RETURN(0); + GOTO(free, rc = 0); if (rc < 0) { /* sequentialize it */ - swap(inode, inode2); - swap(file, file2); + swap(llss->inode1, llss->inode2); + swap(file1, file2); + swap(llss->dv1, llss->dv2); + swap(llss->check_dv1, llss->check_dv2); } gid = lsl->sl_gid; if (gid != 0) { /* application asks to flush dirty cache */ - rc = ll_get_grouplock(inode, file, gid); + rc = ll_get_grouplock(llss->inode1, file1, gid); if (rc < 0) - RETURN(rc); + GOTO(free, rc); - rc = ll_get_grouplock(inode2, file2, gid); + rc = ll_get_grouplock(llss->inode2, file2, gid); if (rc < 0) { - ll_put_grouplock(inode, file, gid); - RETURN(rc); + ll_put_grouplock(llss->inode1, file1, gid); + GOTO(free, rc); } } + /* to be able to restore mtime and atime after swap + * we need to first save them */ + if (lsl->sl_flags & + (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) { + llss->ia1.ia_mtime = llss->inode1->i_mtime; + llss->ia1.ia_atime = llss->inode1->i_atime; + llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME; + llss->ia2.ia_mtime = llss->inode2->i_mtime; + llss->ia2.ia_atime = llss->inode2->i_atime; + llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME; + } + + /* ultimate check, before swaping the layouts we check if + * dataversion has changed (if requested) */ + if (llss->check_dv1) { + rc = ll_data_version(llss->inode1, &dv, 
0); + if (rc) + GOTO(putgl, rc); + if (dv != llss->dv1) + GOTO(putgl, rc = -EAGAIN); + } + + if (llss->check_dv2) { + rc = ll_data_version(llss->inode2, &dv, 0); + if (rc) + GOTO(putgl, rc); + if (dv != llss->dv2) + GOTO(putgl, rc = -EAGAIN); + } + /* struct md_op_data is used to send the swap args to the mdt * only flags is missing, so we use struct mdc_swap_layouts * through the md_op_data->op_data */ + /* flags from user space have to be converted before they are send to + * server, no flag is sent today, they are only used on the client */ + msl.msl_flags = 0; rc = -ENOMEM; - op_data = ll_prep_md_op_data(NULL, inode, inode2, NULL, 0, 0, - LUSTRE_OPC_ANY, &msl); + op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0, + 0, LUSTRE_OPC_ANY, &msl); if (op_data != NULL) { - rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(inode), - sizeof(*op_data), op_data, NULL); + rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, + ll_i2mdexp(llss->inode1), + sizeof(*op_data), op_data, NULL); ll_finish_md_op_data(op_data); } +putgl: if (gid != 0) { - ll_put_grouplock(inode2, file2, gid); - ll_put_grouplock(inode, file, gid); + ll_put_grouplock(llss->inode2, file2, gid); + ll_put_grouplock(llss->inode1, file1, gid); + } + + /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) 
*/ + if (rc != 0) + GOTO(free, rc); + + /* clear useless flags */ + if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) { + llss->ia1.ia_valid &= ~ATTR_MTIME; + llss->ia2.ia_valid &= ~ATTR_MTIME; + } + + if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) { + llss->ia1.ia_valid &= ~ATTR_ATIME; + llss->ia2.ia_valid &= ~ATTR_ATIME; } + /* update time if requested */ + rc = rc1 = 0; + if (llss->ia2.ia_valid != 0) + rc = ll_setattr(file1->f_dentry, &llss->ia2); + + if (llss->ia1.ia_valid != 0) + rc1 = ll_setattr(file2->f_dentry, &llss->ia1); + +free: + if (llss != NULL) + OBD_FREE_PTR(llss); + RETURN(rc); } @@ -1979,7 +2068,7 @@ long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) rc = -EPERM; if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */ - rc = ll_swap_layout(file, file2, &lsl); + rc = ll_swap_layouts(file, file2, &lsl); fput(file2); RETURN(rc); } diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c index c868ae8..d4284e6 100644 --- a/lustre/lmv/lmv_obd.c +++ b/lustre/lmv/lmv_obd.c @@ -940,12 +940,17 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, case LL_IOC_HSM_ACTION: case LL_IOC_LOV_SWAP_LAYOUTS: { struct md_op_data *op_data = karg; - struct lmv_tgt_desc *tgt; + struct lmv_tgt_desc *tgt1, *tgt2; - tgt = lmv_find_target(lmv, &op_data->op_fid1); - if (!tgt->ltd_exp) + tgt1 = lmv_find_target(lmv, &op_data->op_fid1); + tgt2 = lmv_find_target(lmv, &op_data->op_fid2); + if ((tgt1->ltd_exp == NULL) || (tgt2->ltd_exp == NULL)) RETURN(-EINVAL); + /* only files on same MDT can be have their layouts swapped */ + if (tgt1->ltd_idx != tgt2->ltd_idx) + RETURN(-EPERM); + rc = obd_iocontrol(cmd, lmv->tgts[0]->ltd_exp, len, karg, uarg); break; } diff --git a/lustre/mdd/mdd_object.c b/lustre/mdd/mdd_object.c index 5a43810..80a7091 100644 --- a/lustre/mdd/mdd_object.c +++ b/lustre/mdd/mdd_object.c @@ -1028,7 +1028,7 @@ static int mdd_xattr_set(const struct lu_env *env, struct md_object *obj, if (IS_ERR(handle)) 
RETURN(PTR_ERR(handle)); - rc = mdd_declare_xattr_set(env, mdd, mdd_obj, buf, name, 0, handle); + rc = mdd_declare_xattr_set(env, mdd, mdd_obj, buf, name, fl, handle); if (rc) GOTO(stop, rc); @@ -1158,6 +1158,17 @@ static struct lu_buf *mdd_get_lov_ea(const struct lu_env *env, repeat: rc = mdo_xattr_get(env, obj, buf, XATTR_NAME_LOV, mdd_object_capa(env, obj)); + + if (rc == -ERANGE) { + /* mti_big_buf is allocated but is too small + * we need to increase it */ + buf = lu_buf_check_and_alloc(&mdd_env_info(env)->mti_big_buf, + buf->lb_len * 2); + if (buf->lb_buf == NULL) + GOTO(out, rc = -ENOMEM); + goto repeat; + } + if (rc < 0) GOTO(out, rc); @@ -1258,7 +1269,7 @@ static int mdd_swap_layouts(const struct lu_env *env, struct md_object *obj1, /* we have to sort the 2 obj, so locking will always * be in the same order, even in case of 2 concurrent swaps */ rc = lu_fid_cmp(mdo2fid(md2mdd_obj(obj1)), - mdo2fid(md2mdd_obj(obj2))); + mdo2fid(md2mdd_obj(obj2))); /* same fid ? */ if (rc == 0) RETURN(-EPERM); diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index ba3f24d..7d16db7 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -1063,7 +1063,8 @@ int mdt_swap_layouts(struct mdt_thread_info *info) GOTO(put, rc); msl = req_capsule_client_get(info->mti_pill, &RMF_SWAP_LAYOUTS); - LASSERT(msl != NULL); + if (msl == NULL) + GOTO(put, rc = -EPROTO); lh1 = &info->mti_lh[MDT_LH_NEW]; mdt_lock_reg_init(lh1, LCK_EX); diff --git a/lustre/scripts/lfs_migrate b/lustre/scripts/lfs_migrate index a09c368..bf595b3 100755 --- a/lustre/scripts/lfs_migrate +++ b/lustre/scripts/lfs_migrate @@ -101,6 +101,8 @@ umask 0077 $LFS getstripe --help 2>&1 | grep -q stripe-size && LFS_SIZE_OPT="-S" lfs_migrate() { + local RSYNC_MODE=false + while IFS='' read -d '' OLDNAME; do $ECHO -n "$OLDNAME: " @@ -130,7 +132,6 @@ lfs_migrate() { continue fi - if [ "$OPT_RESTRIPE" ]; then UNLINK="" else @@ -147,6 +148,19 @@ lfs_migrate() { [ -z "$COUNT" -o -z "$SIZE" ] && 
UNLINK="" fi + + # first try to migrate inside lustre + # if failed go back to old rsync mode + if [[ $RSYNC_MODE == false ]]; then + $LFS migrate -c${COUNT} ${LFS_SIZE_OPT}${SIZE} $OLDNAME + if [[ $? == 0 ]]; then + $ECHO "done" + continue + else + RSYNC_MODE=true + fi + fi + NEWNAME=$(mktemp $UNLINK "$OLDNAME.tmp.XXXXXX") if [ $? -ne 0 -o -z "$NEWNAME" ]; then echo -e "\r$OLDNAME: can't make temp file, skipped" 1>&2 diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 997edf7..9c071b3 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -89,6 +89,13 @@ check_kernel_version() { return 1 } +check_swap_layouts_support() +{ + $LCTL get_param -n llite.*.sbi_flags | grep -q layout || + { skip "Does not support layout lock."; return 0; } + return 1 +} + if [ "$ONLY" == "cleanup" ]; then sh llmountcleanup.sh exit 0 @@ -4403,6 +4410,29 @@ test_56w() { } run_test 56w "check lfs_migrate -c stripe_count works" +test_56x() { + check_swap_layouts_support && return 0 + [ "$OSTCOUNT" -lt "2" ] && + skip_env "need 2 OST, skipping test" && return + + local dir0=$DIR/$tdir/$testnum + mkdir -p $dir0 || error "creating dir $dir0" + + local ref1=/etc/passwd + local file1=$dir0/file1 + + $SETSTRIPE -c 2 $file1 + cp $ref1 $file1 + $LFS migrate -c 1 $file1 || error "migrate failed rc = $?" 
+ stripe=$($GETSTRIPE -c $file1) + [[ $stripe == 1 ]] || error "stripe of $file1 is $stripe != 1" + cmp $file1 $ref1 || error "content mismatch $file1 differs from $ref1" + + # clean up + rm -f $file1 +} +run_test 56x "lfs migration support" + test_57a() { [ $PARALLEL == "yes" ] && skip "skip parallel run" && return # note test will not do anything if MDS is not local @@ -9519,43 +9549,6 @@ test_183() { # LU-2275 } run_test 183 "No crash or request leak in case of strange dispositions ========" -test_185() { # LU-2441 - mkdir -p $DIR/$tdir || error "creating dir $DIR/$tdir" - touch $DIR/$tdir/spoo - local mtime1=$(stat -c "%Y" $DIR/$tdir) - local fid=$($MULTIOP $DIR/$tdir VFw4096c) || - error "cannot create/write a volatile file" - $CHECKSTAT -t file $MOUNT/.lustre/fid/$fid 2>/dev/null && - error "FID is still valid after close" - - multiop_bg_pause $DIR/$tdir vVw4096_c - local multi_pid=$! - - local OLD_IFS=$IFS - IFS=":" - local fidv=($fid) - IFS=$OLD_IFS - # assume that the next FID for this client is sequential, since stdout - # is unfortunately eaten by multiop_bg_pause - local n=$((${fidv[1]} + 1)) - local next_fid="${fidv[0]}:$(printf "0x%x" $n):${fidv[2]}" - $CHECKSTAT -t file $MOUNT/.lustre/fid/$next_fid || - error "FID is missing before close" - kill -USR1 $multi_pid - # 1 second delay, so if mtime change we will see it - sleep 1 - local mtime2=$(stat -c "%Y" $DIR/$tdir) - [[ $mtime1 == $mtime2 ]] || error "mtime has changed" -} -run_test 185 "Volatile file support" - -check_swap_layouts_support() -{ - $LCTL get_param -n llite.*.sbi_flags | grep -q layout || - { skip "Does not support layout lock."; return 0; } - return 1 -} - # test suite 184 is for LU-2016, LU-2017 test_184a() { check_swap_layouts_support && return 0 @@ -9662,6 +9655,36 @@ test_184c() { } run_test 184c "Concurrent write and layout swap" +test_185() { # LU-2441 + mkdir -p $DIR/$tdir || error "creating dir $DIR/$tdir" + touch $DIR/$tdir/spoo + local mtime1=$(stat -c "%Y" $DIR/$tdir) + 
local fid=$($MULTIOP $DIR/$tdir VFw4096c) || + error "cannot create/write a volatile file" + $CHECKSTAT -t file $MOUNT/.lustre/fid/$fid 2>/dev/null && + error "FID is still valid after close" + + multiop_bg_pause $DIR/$tdir vVw4096_c + local multi_pid=$! + + local OLD_IFS=$IFS + IFS=":" + local fidv=($fid) + IFS=$OLD_IFS + # assume that the next FID for this client is sequential, since stdout + # is unfortunately eaten by multiop_bg_pause + local n=$((${fidv[1]} + 1)) + local next_fid="${fidv[0]}:$(printf "0x%x" $n):${fidv[2]}" + $CHECKSTAT -t file $MOUNT/.lustre/fid/$next_fid || + error "FID is missing before close" + kill -USR1 $multi_pid + # 1 second delay, so if mtime change we will see it + sleep 1 + local mtime2=$(stat -c "%Y" $DIR/$tdir) + [[ $mtime1 == $mtime2 ]] || error "mtime has changed" +} +run_test 185 "Volatile file support" + # OST pools tests check_file_in_pool() { diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c index 8fef145..95b3a71 100644 --- a/lustre/utils/lfs.c +++ b/lustre/utils/lfs.c @@ -72,6 +72,8 @@ #include #include +#include +#include #include "obdctl.h" /* all functions */ @@ -118,32 +120,37 @@ static int lfs_hsm_remove(int argc, char **argv); static int lfs_hsm_cancel(int argc, char **argv); static int lfs_swap_layouts(int argc, char **argv); +#define SETSTRIPE_USAGE(_cmd, _tgt) \ + "usage: "_cmd" [--stripe-count|-c ]\n"\ + " [--stripe-index|-i ]\n"\ + " [--stripe-size|-S ]\n"\ + " [--pool|-p ]\n"\ + " [--block|-b] "_tgt"\n"\ + "\tstripe_size: Number of bytes on each OST (0 filesystem default)\n"\ + "\t Can be specified with k, m or g (in KB, MB and GB\n"\ + "\t respectively)\n"\ + "\tstart_ost_idx: OST index of first stripe (-1 default)\n"\ + "\tstripe_count: Number of OSTs to stripe over (0 default, -1 all)\n"\ + "\tpool_name: Name of OST pool to use (default none)\n"\ + "\tblock: Block file access during data migration" + /* all avaialable commands */ command_t cmdlist[] = { - {"setstripe", lfs_setstripe, 0, - "Create a new 
file with a specific striping pattern or\n" - "set the default striping pattern on an existing directory or\n" - "delete the default striping pattern from an existing directory\n" - "usage: setstripe [--stripe-count|-c ]\n" - " [--stripe-index|-i ]\n" - " [--stripe-size|-S ]\n" - " [--pool|-p ] \n" - " or\n" - " setstripe -d (to delete default striping)\n" - "\tstripe_size: Number of bytes on each OST (0 filesystem default)\n" - "\t Can be specified with k, m or g (in KB, MB and GB\n" - "\t respectively)\n" - "\tstart_ost_idx: OST index of first stripe (-1 default)\n" - "\tstripe_count: Number of OSTs to stripe over (0 default, -1 all)\n" - "\tpool_name: Name of OST pool to use (default none)"}, - {"getstripe", lfs_getstripe, 0, - "To list the striping info for a given file or files in a\n" - "directory or recursively for all files in a directory tree.\n" - "usage: getstripe [--ost|-O ] [--quiet | -q] [--verbose | -v]\n" - " [--stripe-count|-c] [--stripe-index|-i]\n" - " [--pool|-p] [--stripe-size|-S] [--directory|-d]\n" - " [--mdt-index|-M] [--recursive|-r] [--raw|-R]\n" - " ..."}, + {"setstripe", lfs_setstripe, 0, + "Create a new file with a specific striping pattern or\n" + "set the default striping pattern on an existing directory or\n" + "delete the default striping pattern from an existing directory\n" + "usage: setstripe -d (to delete default striping)\n"\ + " or\n" + SETSTRIPE_USAGE("setstripe", "")}, + {"getstripe", lfs_getstripe, 0, + "To list the striping info for a given file or files in a\n" + "directory or recursively for all files in a directory tree.\n" + "usage: getstripe [--ost|-O ] [--quiet | -q] [--verbose | -v]\n" + " [--stripe-count|-c] [--stripe-index|-i]\n" + " [--pool|-p] [--stripe-size|-S] [--directory|-d]\n" + " [--mdt-index|-M] [--recursive|-r] [--raw|-R]\n" + " ..."}, {"setdirstripe", lfs_setdirstripe, 0, "To create a remote directory on a specified MDT.\n" "usage: setdirstripe <--index|-i mdt_index> \n" @@ -301,6 +308,9 @@ command_t 
cmdlist[] = { "usage: hsm_cancel [--filelist FILELIST] [--data DATA] ..."}, {"swap_layouts", lfs_swap_layouts, 0, "Swap layouts between 2 files.\n" "usage: swap_layouts "}, + {"migrate", lfs_setstripe, 0, "migrate file from one layout to " + "another (may be not safe with concurent writes).\n" + SETSTRIPE_USAGE("migrate ", "")}, {"help", Parser_help, 0, "help"}, {"exit", Parser_quit, 0, "quit"}, {"quit", Parser_quit, 0, "quit"}, @@ -322,70 +332,292 @@ static int isnumber(const char *str) return 1; } +#define MIGRATION_BLOCKS 1 + +static int lfs_migrate(char *name, unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern, char *pool_name, + __u64 migration_flags) +{ + int fd, fdv; + char volatile_file[PATH_MAX]; + char parent[PATH_MAX]; + char *ptr; + int rc; + __u64 dv1; + struct lov_user_md *lum = NULL; + int lumsz; + int bufsz; + void *buf = NULL; + int rsize, wsize; + __u64 rpos, wpos, bufoff; + int gid = 0, sz; + int have_gl = 0; + + /* find the right size for the IO and allocate the buffer */ + lumsz = lov_mds_md_size(LOV_MAX_STRIPE_COUNT, LOV_MAGIC_V3); + lum = malloc(lumsz); + if (lum == NULL) { + rc = -ENOMEM; + goto free; + } + + rc = llapi_file_get_stripe(name, lum); + /* failure can come from may case and some may be not real error + * (eg: no stripe) + * in case of a real error, a later call will failed with a better + * error management */ + if (rc < 0) + bufsz = 1024*1024; + else + bufsz = lum->lmm_stripe_size; + rc = posix_memalign(&buf, getpagesize(), bufsz); + if (rc != 0) { + rc = -rc; + goto free; + } + + if (migration_flags & MIGRATION_BLOCKS) { + /* generate a random id for the grouplock */ + fd = open("/dev/urandom", O_RDONLY); + if (fd == -1) { + rc = -errno; + fprintf(stderr, "cannot open /dev/urandom (%s)\n", + strerror(-rc)); + goto free; + } + sz = sizeof(gid); + rc = read(fd, &gid, sz); + close(fd); + if (rc < sz) { + rc = -errno; + fprintf(stderr, "cannot read %d bytes from" + " /dev/urandom (%s)\n", 
sz, strerror(-rc)); + goto free; + } + } + + /* search for file directory pathname */ + strcpy(parent, name); + ptr = strrchr(parent, '/'); + if (ptr == NULL) { + if (getcwd(parent, sizeof(parent)) == NULL) { + rc = -errno; + goto free; + } + } else { + if (ptr == parent) + strcpy(parent, "/"); + else + *ptr = '\0'; + } + sprintf(volatile_file, "%s/%s::", parent, LUSTRE_VOLATILE_HDR); + + /* create, open a volatile file, use caching (ie no directio) */ + /* exclusive create is not needed because volatile files cannot + * conflict on name by construction */ + fdv = llapi_file_open_pool(volatile_file, O_CREAT | O_WRONLY, + 0644, stripe_size, stripe_offset, + stripe_count, stripe_pattern, pool_name); + if (fdv < 0) { + rc = fdv; + fprintf(stderr, "cannot create volatile file in %s (%s)\n", + parent, strerror(-rc)); + goto free; + } + + /* open file, direct io */ + /* even if the file is only read, WR mode is nedeed to allow + * layout swap on fd */ + fd = open(name, O_RDWR | O_DIRECT); + if (fd == -1) { + rc = -errno; + fprintf(stderr, "cannot open %s (%s)\n", name, strerror(-rc)); + close(fdv); + goto free; + } + + /* get file data version */ + rc = llapi_get_data_version(fd, &dv1, 0); + if (rc != 0) { + fprintf(stderr, "cannot get dataversion on %s (%s)\n", + name, strerror(-rc)); + goto error; + } + + if (migration_flags & MIGRATION_BLOCKS) { + /* take group lock to limit concurent access + * this will be no more needed when exclusive access will + * be implemented (see LU-2919) */ + /* group lock is taken after data version read because it + * blocks data version call */ + if (ioctl(fd, LL_IOC_GROUP_LOCK, gid) == -1) { + rc = -errno; + fprintf(stderr, "cannot get group lock on %s (%s)\n", + name, strerror(-rc)); + goto error; + } + have_gl = 1; + } + + /* copy data */ + rpos = 0; + wpos = 0; + bufoff = 0; + rsize = -1; + do { + /* read new data only if we have written all + * previously read data */ + if (wpos == rpos) { + rsize = read(fd, buf, bufsz); + if (rsize 
< 0) { + rc = -errno; + fprintf(stderr, "read failed on %s" + " (%s)\n", name, + strerror(-rc)); + goto error; + } + rpos += rsize; + bufoff = 0; + } + /* eof ? */ + if (rsize == 0) + break; + wsize = write(fdv, buf + bufoff, rpos - wpos); + if (wsize < 0) { + rc = -errno; + fprintf(stderr, "write failed on volatile" + " for %s (%s)\n", name, strerror(-rc)); + goto error; + } + wpos += wsize; + bufoff += wsize; + } while (1); + + /* flush data */ + fsync(fdv); + + if (migration_flags & MIGRATION_BLOCKS) { + /* give back group lock */ + if (ioctl(fd, LL_IOC_GROUP_UNLOCK, gid) == -1) { + rc = -errno; + fprintf(stderr, "cannot put group lock on %s (%s)\n", + name, strerror(-rc)); + } + have_gl = 0; + } + + /* swap layouts + * for a migration we need to: + * - check data version on file did not change + * - keep file mtime + * - keep file atime + */ + rc = llapi_fswap_layouts(fd, fdv, dv1, 0, + SWAP_LAYOUTS_CHECK_DV1 | + SWAP_LAYOUTS_KEEP_MTIME | + SWAP_LAYOUTS_KEEP_ATIME); + if (rc == -EAGAIN) { + fprintf(stderr, "file dataversion for %s has changed" + " during copy, migration is aborted\n", + name); + goto error; + } + if (rc != 0) + fprintf(stderr, "cannot swap layouts between %s and " + "a volatile file (%s)\n", + name, strerror(-rc)); + +error: + /* give back group lock */ + if ((migration_flags & MIGRATION_BLOCKS) && have_gl && + (ioctl(fd, LL_IOC_GROUP_UNLOCK, gid) == -1)) { + /* we keep in rc the original error */ + fprintf(stderr, "cannot put group lock on %s (%s)\n", + name, strerror(-errno)); + } + + close(fdv); + close(fd); +free: + if (lum) + free(lum); + if (buf) + free(buf); + return rc; +} + /* functions */ static int lfs_setstripe(int argc, char **argv) { - char *fname; - int result; - unsigned long long st_size; - int st_offset, st_count; - char *end; - int c; - int delete = 0; - char *stripe_size_arg = NULL; - char *stripe_off_arg = NULL; - char *stripe_count_arg = NULL; - char *pool_name_arg = NULL; - unsigned long long size_units = 1; + char 
*fname; + int result; + unsigned long long st_size; + int st_offset, st_count; + char *end; + int c; + int delete = 0; + char *stripe_size_arg = NULL; + char *stripe_off_arg = NULL; + char *stripe_count_arg = NULL; + char *pool_name_arg = NULL; + unsigned long long size_units = 1; + int migrate_mode = 0; + __u64 migration_flags = 0; - struct option long_opts[] = { + struct option long_opts[] = { + /* valid only in migrate mode */ + {"block", no_argument, 0, 'b'}, #if LUSTRE_VERSION >= OBD_OCD_VERSION(2,9,50,0) #warning "remove deprecated --count option" #else - /* This formerly implied "stripe-count", but was explicitly - * made "stripe-count" for consistency with other options, - * and to separate it from "mdt-count" when DNE arrives. */ - {"count", required_argument, 0, 'c'}, + /* This formerly implied "stripe-count", but was explicitly + * made "stripe-count" for consistency with other options, + * and to separate it from "mdt-count" when DNE arrives. */ + {"count", required_argument, 0, 'c'}, #endif - {"stripe-count", required_argument, 0, 'c'}, - {"stripe_count", required_argument, 0, 'c'}, - {"delete", no_argument, 0, 'd'}, + {"stripe-count", required_argument, 0, 'c'}, + {"stripe_count", required_argument, 0, 'c'}, + {"delete", no_argument, 0, 'd'}, #if LUSTRE_VERSION >= OBD_OCD_VERSION(2,9,50,0) #warning "remove deprecated --index option" #else - /* This formerly implied "stripe-index", but was explicitly - * made "stripe-index" for consistency with other options, - * and to separate it from "mdt-index" when DNE arrives. */ - {"index", required_argument, 0, 'i'}, + /* This formerly implied "stripe-index", but was explicitly + * made "stripe-index" for consistency with other options, + * and to separate it from "mdt-index" when DNE arrives. 
*/ + {"index", required_argument, 0, 'i'}, #endif - {"stripe-index", required_argument, 0, 'i'}, - {"stripe_index", required_argument, 0, 'i'}, + {"stripe-index", required_argument, 0, 'i'}, + {"stripe_index", required_argument, 0, 'i'}, #if LUSTRE_VERSION >= OBD_OCD_VERSION(2,9,50,0) #warning "remove deprecated --offset option" #else - /* This formerly implied "stripe-index", but was confusing - * with "file offset" (which will eventually be needed for - * with different layouts by offset), so deprecate it. */ - {"offset", required_argument, 0, 'o'}, + /* This formerly implied "stripe-index", but was confusing + * with "file offset" (which will eventually be needed for + * with different layouts by offset), so deprecate it. */ + {"offset", required_argument, 0, 'o'}, #endif - {"pool", required_argument, 0, 'p'}, + {"pool", required_argument, 0, 'p'}, #if LUSTRE_VERSION >= OBD_OCD_VERSION(2,9,50,0) #warning "remove deprecated --size option" #else - /* This formerly implied "--stripe-size", but was confusing - * with "lfs find --size|-s", which means "file size", so use - * the consistent "--stripe-size|-S" for all commands. */ - {"size", required_argument, 0, 's'}, + /* This formerly implied "--stripe-size", but was confusing + * with "lfs find --size|-s", which means "file size", so use + * the consistent "--stripe-size|-S" for all commands. */ + {"size", required_argument, 0, 's'}, #endif - {"stripe-size", required_argument, 0, 'S'}, - {"stripe_size", required_argument, 0, 'S'}, - {0, 0, 0, 0} - }; + {"stripe-size", required_argument, 0, 'S'}, + {"stripe_size", required_argument, 0, 'S'}, + {0, 0, 0, 0} + }; st_size = 0; st_offset = -1; st_count = 0; + if (strcmp(argv[0], "migrate") == 0) + migrate_mode = 1; + #if LUSTRE_VERSION < OBD_OCD_VERSION(2,4,50,0) if (argc == 5 && argv[1][0] != '-' && isnumber(argv[2]) && isnumber(argv[3]) && isnumber(argv[4])) { @@ -404,6 +636,14 @@ static int lfs_setstripe(int argc, char **argv) case 0: /* Long options. 
*/ break; + case 'b': + if (migrate_mode == 0) { + fprintf(stderr, "--block is valid only for" + " migrate mode"); + return CMD_HELP; + } + migration_flags |= MIGRATION_BLOCKS; + break; case 'c': #if LUSTRE_VERSION >= OBD_OCD_VERSION(2,9,50,0) #warning "remove deprecated --count option" @@ -502,18 +742,26 @@ static int lfs_setstripe(int argc, char **argv) } } - do { - result = llapi_file_create_pool(fname, st_size, st_offset, - st_count, 0, pool_name_arg); - if (result) { - fprintf(stderr,"error: %s: create stripe file '%s' " - "failed\n", argv[0], fname); - break; - } - fname = argv[++optind]; - } while (fname != NULL); + do { + if (migrate_mode) + result = lfs_migrate(fname, st_size, st_offset, + st_count, 0, pool_name_arg, + migration_flags); + else + result = llapi_file_create_pool(fname, st_size, + st_offset, st_count, + 0, pool_name_arg); + if (result) { + fprintf(stderr, + "error: %s: %s stripe file '%s' failed\n", + argv[0], migrate_mode ? "migrate" : "create", + fname); + break; + } + fname = argv[++optind]; + } while (fname != NULL); - return result; + return result; } static int lfs_poollist(int argc, char **argv) @@ -3315,7 +3563,9 @@ static int lfs_swap_layouts(int argc, char **argv) if (argc != 3) return CMD_HELP; - return llapi_swap_layouts(argv[1], argv[2]); + return llapi_swap_layouts(argv[1], argv[2], 0, 0, + SWAP_LAYOUTS_KEEP_MTIME | + SWAP_LAYOUTS_KEEP_ATIME); } int main(int argc, char **argv) diff --git a/lustre/utils/liblustreapi.c b/lustre/utils/liblustreapi.c index ee5bbf1..fb1e69f 100644 --- a/lustre/utils/liblustreapi.c +++ b/lustre/utils/liblustreapi.c @@ -4144,15 +4144,17 @@ int llapi_create_volatile_idx(char *directory, int idx, int mode) * first fd received the ioctl, second fd is passed as arg * this is assymetric but avoid use of root path for ioctl */ -int llapi_fswap_layouts(int fd1, int fd2) +int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, __u64 flags) { struct lustre_swap_layouts lsl; int rc; srandom(time(NULL)); 
lsl.sl_fd = fd2; - lsl.sl_flags = 0; + lsl.sl_flags = flags; lsl.sl_gid = random(); + lsl.sl_dv1 = dv1; + lsl.sl_dv2 = dv2; rc = ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl); if (rc) rc = -errno; @@ -4163,7 +4165,8 @@ int llapi_fswap_layouts(int fd1, int fd2) * Swap the layouts between 2 files * the 2 files are open in write */ -int llapi_swap_layouts(const char *path1, const char *path2) +int llapi_swap_layouts(const char *path1, const char *path2, + __u64 dv1, __u64 dv2, __u64 flags) { int fd1, fd2, rc; @@ -4184,7 +4187,7 @@ int llapi_swap_layouts(const char *path1, const char *path2) return -errno; } - rc = llapi_fswap_layouts(fd1, fd2); + rc = llapi_fswap_layouts(fd1, fd2, dv1, dv2, flags); if (rc < 0) llapi_error(LLAPI_MSG_ERROR, rc, "error: cannot swap layouts between %s and %s\n", -- 1.8.3.1