Whamcloud - gitweb
LU-2445 utils: lfs migrate support
authorjcl <jacques-charles.lafoucriere@cea.fr>
Wed, 6 Mar 2013 21:40:42 +0000 (22:40 +0100)
committerOleg Drokin <oleg.drokin@intel.com>
Tue, 26 Mar 2013 04:43:18 +0000 (00:43 -0400)
Add a new command migrate to lfs utility.
migrate takes the same args as setstripe, plus a --block option.
It:
- creates a volatile file based on the new stripe information
- copies the file data to this volatile file
- swaps the 2 layouts
- if --block is set, takes the group lock to block other access
  during the copy

So after migrate the file is restriped (mtime, atime are not
changed).
lfs_migrate is also updated to use the new call and
keeps the rsync mode as a fallback.

Signed-off-by: JC Lafoucriere <jacques-charles.lafoucriere@cea.fr>
Change-Id: I96bafb0be9bc273295c4c900c65b4028864fcbaa
Reviewed-on: http://review.whamcloud.com/5620
Tested-by: Hudson
Reviewed-by: Jinshan Xiong <jinshan.xiong@intel.com>
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
lustre/include/lustre/lustre_user.h
lustre/include/lustre/lustreapi.h
lustre/llite/file.c
lustre/lmv/lmv_obd.c
lustre/mdd/mdd_object.c
lustre/mdt/mdt_handler.c
lustre/scripts/lfs_migrate
lustre/tests/sanity.sh
lustre/utils/lfs.c
lustre/utils/liblustreapi.c

index bd03b57..ef02f90 100644 (file)
@@ -572,10 +572,17 @@ struct if_quotactl {
         struct obd_uuid         obd_uuid;
 };
 
+/* swap layout flags
+ * Bits accepted in lustre_swap_layouts::sl_flags. In this change they are
+ * interpreted by the client only (ll_swap_layouts() sends msl_flags = 0 to
+ * the server).
+ */
+#define        SWAP_LAYOUTS_CHECK_DV1          (1 << 0) /* compare sl_dv1 with the data version of the file the ioctl is issued on */
+#define        SWAP_LAYOUTS_CHECK_DV2          (1 << 1) /* compare sl_dv2 with the data version of the second file */
+#define        SWAP_LAYOUTS_KEEP_MTIME         (1 << 2) /* re-apply the mtime saved before the swap */
+#define        SWAP_LAYOUTS_KEEP_ATIME         (1 << 3) /* re-apply the atime saved before the swap */
 struct lustre_swap_layouts {
        __u64   sl_flags;
        __u32   sl_fd;
        __u32   sl_gid;
+       __u64   sl_dv1; /* expected data version, used with SWAP_LAYOUTS_CHECK_DV1 */
+       __u64   sl_dv2; /* expected data version, used with SWAP_LAYOUTS_CHECK_DV2 */
 };
 
 
index 22cf4ce..c500965 100644 (file)
@@ -251,8 +251,10 @@ static inline int llapi_create_volatile(char *directory, int mode)
 }
 
 
-extern int llapi_fswap_layouts(const int fd1, const int fd2);
-extern int llapi_swap_layouts(const char *path1, const char *path2);
+extern int llapi_fswap_layouts(const int fd1, const int fd2,
+                              __u64 dv1, __u64 dv2, __u64 flags);
+extern int llapi_swap_layouts(const char *path1, const char *path2,
+                             __u64 dv1, __u64 dv2, __u64 flags);
 
 /* Changelog interface.  priv is private state, managed internally
    by these functions */
index a6dee27..36e3cf6 100644 (file)
@@ -1855,65 +1855,154 @@ int ll_data_version(struct inode *inode, __u64 *data_version,
        RETURN(rc);
 }
 
-static int ll_swap_layout(struct file *file, struct file *file2,
-                       struct lustre_swap_layouts *lsl)
-{
-       struct mdc_swap_layouts  msl = { .msl_flags = lsl->sl_flags };
-       struct md_op_data       *op_data;
-       struct inode            *inode = file->f_dentry->d_inode;
-       struct inode            *inode2 = file2->f_dentry->d_inode;
-       __u32 gid;
-       int rc;
+/* Working state of ll_swap_layouts(); it is allocated with OBD_ALLOC_PTR()
+ * instead of living in the function's stack frame.
+ * Every member comes in a pair (one per file) so that the pairs can be
+ * swapped together when the two files are ordered by FID. */
+struct ll_swap_stack {
+       struct iattr             ia1, ia2;
+       __u64                    dv1, dv2;
+       struct inode            *inode1, *inode2;
+       bool                     check_dv1, check_dv2;
+};
 
-       if (!S_ISREG(inode2->i_mode))
-               RETURN(-EINVAL);
+/**
+ * Swap the layouts of two files.
+ *
+ * The second file must be a regular file, both files must be writable by
+ * the caller and live on the same super block. The two files are ordered
+ * by FID before anything is locked. If lsl->sl_gid is not 0, a group lock
+ * with this id is held on both files around the swap.
+ * If SWAP_LAYOUTS_CHECK_DV1/2 is set, the current data version of the
+ * matching file is compared with lsl->sl_dv1/2 and the swap is abandoned
+ * with -EAGAIN when they differ.
+ * If SWAP_LAYOUTS_KEEP_MTIME/ATIME is set, the times saved before the swap
+ * are applied with ll_setattr() once the swap is done.
+ *
+ * \param file1        file the ioctl was issued on
+ * \param file2        second file
+ * \param lsl  swap arguments coming from the caller
+ *
+ * \retval 0 on success, or if both files are the same file
+ * \retval negative errno on failure; when both time updates fail, the
+ *         error of the first one is returned
+ */
+static int ll_swap_layouts(struct file *file1, struct file *file2,
+                          struct lustre_swap_layouts *lsl)
+{
+       struct mdc_swap_layouts  msl;
+       struct md_op_data       *op_data;
+       __u32                    gid;
+       __u64                    dv;
+       struct ll_swap_stack    *llss = NULL;
+       int                      rc, rc1;
 
-       if (ll_permission(inode, MAY_WRITE, NULL) ||
-           ll_permission(inode2, MAY_WRITE, NULL))
-               RETURN(-EPERM);
+       OBD_ALLOC_PTR(llss);
+       if (llss == NULL)
+               RETURN(-ENOMEM);
+
+       llss->inode1 = file1->f_dentry->d_inode;
+       llss->inode2 = file2->f_dentry->d_inode;
 
-       if (inode2->i_sb != inode->i_sb)
-               RETURN(-EXDEV);
+       if (!S_ISREG(llss->inode2->i_mode))
+               GOTO(free, rc = -EINVAL);
 
-       rc = lu_fid_cmp(ll_inode2fid(inode), ll_inode2fid(inode2));
+       if (ll_permission(llss->inode1, MAY_WRITE, NULL) ||
+           ll_permission(llss->inode2, MAY_WRITE, NULL))
+               GOTO(free, rc = -EPERM);
+
+       if (llss->inode2->i_sb != llss->inode1->i_sb)
+               GOTO(free, rc = -EXDEV);
+
+       /* we use 2 bool because it is easier to swap than 2 bits */
+       if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
+               llss->check_dv1 = true;
+
+       if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
+               llss->check_dv2 = true;
+
+       /* we cannot use lsl->sl_dvX directly because we may swap them */
+       llss->dv1 = lsl->sl_dv1;
+       llss->dv2 = lsl->sl_dv2;
+
+       rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
        if (rc == 0) /* same file, done! */
-               RETURN(0);
+               GOTO(free, rc = 0);
 
        if (rc < 0) { /* sequentialize it */
-               swap(inode, inode2);
-               swap(file, file2);
+               swap(llss->inode1, llss->inode2);
+               swap(file1, file2);
+               swap(llss->dv1, llss->dv2);
+               swap(llss->check_dv1, llss->check_dv2);
        }
 
        gid = lsl->sl_gid;
        if (gid != 0) { /* application asks to flush dirty cache */
-               rc = ll_get_grouplock(inode, file, gid);
+               rc = ll_get_grouplock(llss->inode1, file1, gid);
                if (rc < 0)
-                       RETURN(rc);
+                       GOTO(free, rc);
 
-               rc = ll_get_grouplock(inode2, file2, gid);
+               rc = ll_get_grouplock(llss->inode2, file2, gid);
                if (rc < 0) {
-                       ll_put_grouplock(inode, file, gid);
-                       RETURN(rc);
+                       ll_put_grouplock(llss->inode1, file1, gid);
+                       GOTO(free, rc);
                }
        }
 
+       /* to be able to restore mtime and atime after swap
+        * we need to first save them */
+       if (lsl->sl_flags &
+           (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
+               llss->ia1.ia_mtime = llss->inode1->i_mtime;
+               llss->ia1.ia_atime = llss->inode1->i_atime;
+               llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
+               llss->ia2.ia_mtime = llss->inode2->i_mtime;
+               llss->ia2.ia_atime = llss->inode2->i_atime;
+               llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
+       }
+
+       /* ultimate check, before swapping the layouts we check if
+        * dataversion has changed (if requested) */
+       if (llss->check_dv1) {
+               rc = ll_data_version(llss->inode1, &dv, 0);
+               if (rc)
+                       GOTO(putgl, rc);
+               if (dv != llss->dv1)
+                       GOTO(putgl, rc = -EAGAIN);
+       }
+
+       if (llss->check_dv2) {
+               rc = ll_data_version(llss->inode2, &dv, 0);
+               if (rc)
+                       GOTO(putgl, rc);
+               if (dv != llss->dv2)
+                       GOTO(putgl, rc = -EAGAIN);
+       }
+
        /* struct md_op_data is used to send the swap args to the mdt
         * only flags is missing, so we use struct mdc_swap_layouts
         * through the md_op_data->op_data */
+       /* flags from user space have to be converted before they are sent to
+        * server, no flag is sent today, they are only used on the client */
+       msl.msl_flags = 0;
        rc = -ENOMEM;
-       op_data = ll_prep_md_op_data(NULL, inode, inode2, NULL, 0, 0,
-                                       LUSTRE_OPC_ANY, &msl);
+       op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
+                                    0, LUSTRE_OPC_ANY, &msl);
        if (op_data != NULL) {
-               rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(inode),
-                                       sizeof(*op_data), op_data, NULL);
+               rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS,
+                                  ll_i2mdexp(llss->inode1),
+                                  sizeof(*op_data), op_data, NULL);
                ll_finish_md_op_data(op_data);
        }
 
+putgl:
        if (gid != 0) {
-               ll_put_grouplock(inode2, file2, gid);
-               ll_put_grouplock(inode, file, gid);
+               ll_put_grouplock(llss->inode2, file2, gid);
+               ll_put_grouplock(llss->inode1, file1, gid);
+       }
+
+       /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
+       if (rc != 0)
+               GOTO(free, rc);
+
+       /* clear useless flags */
+       if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
+               llss->ia1.ia_valid &= ~ATTR_MTIME;
+               llss->ia2.ia_valid &= ~ATTR_MTIME;
+       }
+
+       if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
+               llss->ia1.ia_valid &= ~ATTR_ATIME;
+               llss->ia2.ia_valid &= ~ATTR_ATIME;
        }
 
+       /* update time if requested */
+       rc = rc1 = 0;
+       if (llss->ia2.ia_valid != 0)
+               rc = ll_setattr(file1->f_dentry, &llss->ia2);
+
+       if (llss->ia1.ia_valid != 0)
+               rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
+
+       /* both updates are always attempted; do not lose a failure of the
+        * second one when the first one succeeded */
+       if (rc == 0)
+               rc = rc1;
+
+free:
+       if (llss != NULL)
+               OBD_FREE_PTR(llss);
+
        RETURN(rc);
 }
 
@@ -1979,7 +2068,7 @@ long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
                rc = -EPERM;
                if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
-                       rc = ll_swap_layout(file, file2, &lsl);
+                       rc = ll_swap_layouts(file, file2, &lsl);
                fput(file2);
                RETURN(rc);
        }
index c868ae8..d4284e6 100644 (file)
@@ -940,12 +940,17 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp,
        case LL_IOC_HSM_ACTION:
        case LL_IOC_LOV_SWAP_LAYOUTS: {
                struct md_op_data       *op_data = karg;
-               struct lmv_tgt_desc     *tgt;
+               struct lmv_tgt_desc     *tgt1, *tgt2;
 
-               tgt = lmv_find_target(lmv, &op_data->op_fid1);
-               if (!tgt->ltd_exp)
+               tgt1 = lmv_find_target(lmv, &op_data->op_fid1);
+               tgt2 = lmv_find_target(lmv, &op_data->op_fid2);
+               if ((tgt1->ltd_exp == NULL) || (tgt2->ltd_exp == NULL))
                        RETURN(-EINVAL);
 
+               /* only files on same MDT can be have their layouts swapped */
+               if (tgt1->ltd_idx != tgt2->ltd_idx)
+                       RETURN(-EPERM);
+
                rc = obd_iocontrol(cmd, lmv->tgts[0]->ltd_exp, len, karg, uarg);
                break;
        }
index 5a43810..80a7091 100644 (file)
@@ -1028,7 +1028,7 @@ static int mdd_xattr_set(const struct lu_env *env, struct md_object *obj,
        if (IS_ERR(handle))
                RETURN(PTR_ERR(handle));
 
-       rc = mdd_declare_xattr_set(env, mdd, mdd_obj, buf, name, 0, handle);
+       rc = mdd_declare_xattr_set(env, mdd, mdd_obj, buf, name, fl, handle);
        if (rc)
                GOTO(stop, rc);
 
@@ -1158,6 +1158,17 @@ static struct lu_buf *mdd_get_lov_ea(const struct lu_env *env,
 repeat:
        rc = mdo_xattr_get(env, obj, buf, XATTR_NAME_LOV,
                           mdd_object_capa(env, obj));
+
+       if (rc == -ERANGE) {
+               /* mti_big_buf is allocated but is too small
+                * we need to increase it */
+               buf = lu_buf_check_and_alloc(&mdd_env_info(env)->mti_big_buf,
+                                            buf->lb_len * 2);
+               if (buf->lb_buf == NULL)
+                       GOTO(out, rc = -ENOMEM);
+               goto repeat;
+       }
+
        if (rc < 0)
                GOTO(out, rc);
 
@@ -1258,7 +1269,7 @@ static int mdd_swap_layouts(const struct lu_env *env, struct md_object *obj1,
        /* we have to sort the 2 obj, so locking will always
         * be in the same order, even in case of 2 concurrent swaps */
        rc = lu_fid_cmp(mdo2fid(md2mdd_obj(obj1)),
-                      mdo2fid(md2mdd_obj(obj2)));
+                       mdo2fid(md2mdd_obj(obj2)));
        /* same fid ? */
        if (rc == 0)
                RETURN(-EPERM);
index ba3f24d..7d16db7 100644 (file)
@@ -1063,7 +1063,8 @@ int mdt_swap_layouts(struct mdt_thread_info *info)
                GOTO(put, rc);
 
        msl = req_capsule_client_get(info->mti_pill, &RMF_SWAP_LAYOUTS);
-       LASSERT(msl != NULL);
+       if (msl == NULL)
+               GOTO(put, rc = -EPROTO);
 
        lh1 = &info->mti_lh[MDT_LH_NEW];
        mdt_lock_reg_init(lh1, LCK_EX);
index a09c368..bf595b3 100755 (executable)
@@ -101,6 +101,8 @@ umask 0077
 $LFS getstripe --help 2>&1 | grep -q stripe-size && LFS_SIZE_OPT="-S"
 
 lfs_migrate() {
+       local RSYNC_MODE=false
+
        while IFS='' read -d '' OLDNAME; do
                $ECHO -n "$OLDNAME: "
 
@@ -130,7 +132,6 @@ lfs_migrate() {
                        continue
                fi
 
-
                if [ "$OPT_RESTRIPE" ]; then
                        UNLINK=""
                else
@@ -147,6 +148,19 @@ lfs_migrate() {
 
                        [ -z "$COUNT" -o -z "$SIZE" ] && UNLINK=""
                fi
+
+               # first try to migrate inside lustre
+               # if failed go back to old rsync mode
+               if [[ $RSYNC_MODE == false ]]; then
+                       $LFS migrate -c${COUNT} ${LFS_SIZE_OPT}${SIZE} $OLDNAME
+                       if [[ $? == 0 ]]; then
+                               $ECHO "done"
+                               continue
+                       else
+                               RSYNC_MODE=true
+                       fi
+               fi
+
                NEWNAME=$(mktemp $UNLINK "$OLDNAME.tmp.XXXXXX")
                if [ $? -ne 0 -o -z "$NEWNAME" ]; then
                        echo -e "\r$OLDNAME: can't make temp file, skipped" 1>&2
index 997edf7..9c071b3 100644 (file)
@@ -89,6 +89,13 @@ check_kernel_version() {
        return 1
 }
 
+check_swap_layouts_support() # usage: check_swap_layouts_support && return 0
+{
+       $LCTL get_param -n llite.*.sbi_flags | grep -q layout || # no "layout" in any llite sbi_flags
+               { skip "Does not support layout lock."; return 0; } # 0 = test was skipped, caller must return
+       return 1 # 1 = "layout" flag found, caller goes on with the test
+}
+
 if [ "$ONLY" == "cleanup" ]; then
        sh llmountcleanup.sh
        exit 0
@@ -4403,6 +4410,29 @@ test_56w() {
 }
 run_test 56w "check lfs_migrate -c stripe_count works"
 
+# Migrate a 2-stripe file to 1 stripe with "lfs migrate" and check that the
+# stripe count changed while the content still matches the reference file.
+test_56x() {
+       check_swap_layouts_support && return 0
+       [ "$OSTCOUNT" -lt "2" ] &&
+               skip_env "need 2 OST, skipping test" && return
+
+       local dir0=$DIR/$tdir/$testnum
+       mkdir -p $dir0 || error "creating dir $dir0"
+
+       local ref1=/etc/passwd
+       local file1=$dir0/file1
+       local stripe
+
+       # a failed setup must not be reported as a migrate failure
+       $SETSTRIPE -c 2 $file1 || error "cannot setstripe $file1"
+       cp $ref1 $file1 || error "cannot copy $ref1 to $file1"
+       $LFS migrate -c 1 $file1 || error "migrate failed rc = $?"
+       stripe=$($GETSTRIPE -c $file1)
+       [[ $stripe == 1 ]] || error "stripe of $file1 is $stripe != 1"
+       cmp $file1 $ref1 || error "content mismatch $file1 differs from $ref1"
+
+       # clean up
+       rm -f $file1
+}
+run_test 56x "lfs migration support"
+
 test_57a() {
        [ $PARALLEL == "yes" ] && skip "skip parallel run" && return
        # note test will not do anything if MDS is not local
@@ -9519,43 +9549,6 @@ test_183() { # LU-2275
 }
 run_test 183 "No crash or request leak in case of strange dispositions ========"
 
-test_185() { # LU-2441
-       mkdir -p $DIR/$tdir || error "creating dir $DIR/$tdir"
-       touch $DIR/$tdir/spoo
-       local mtime1=$(stat -c "%Y" $DIR/$tdir)
-       local fid=$($MULTIOP $DIR/$tdir VFw4096c) ||
-               error "cannot create/write a volatile file"
-       $CHECKSTAT -t file $MOUNT/.lustre/fid/$fid 2>/dev/null &&
-               error "FID is still valid after close"
-
-       multiop_bg_pause $DIR/$tdir vVw4096_c
-       local multi_pid=$!
-
-       local OLD_IFS=$IFS
-       IFS=":"
-       local fidv=($fid)
-       IFS=$OLD_IFS
-       # assume that the next FID for this client is sequential, since stdout
-       # is unfortunately eaten by multiop_bg_pause
-       local n=$((${fidv[1]} + 1))
-       local next_fid="${fidv[0]}:$(printf "0x%x" $n):${fidv[2]}"
-       $CHECKSTAT -t file $MOUNT/.lustre/fid/$next_fid ||
-               error "FID is missing before close"
-       kill -USR1 $multi_pid
-       # 1 second delay, so if mtime change we will see it
-       sleep 1
-       local mtime2=$(stat -c "%Y" $DIR/$tdir)
-       [[ $mtime1 == $mtime2 ]] || error "mtime has changed"
-}
-run_test 185 "Volatile file support"
-
-check_swap_layouts_support()
-{
-       $LCTL get_param -n llite.*.sbi_flags | grep -q layout ||
-               { skip "Does not support layout lock."; return 0; }
-       return 1
-}
-
 # test suite 184 is for LU-2016, LU-2017
 test_184a() {
        check_swap_layouts_support && return 0
@@ -9662,6 +9655,36 @@ test_184c() {
 }
 run_test 184c "Concurrent write and layout swap"
 
+test_185() { # LU-2441: volatile file FID lookup before/after close, and parent dir mtime
+       mkdir -p $DIR/$tdir || error "creating dir $DIR/$tdir"
+       touch $DIR/$tdir/spoo
+       local mtime1=$(stat -c "%Y" $DIR/$tdir)
+       local fid=$($MULTIOP $DIR/$tdir VFw4096c) ||
+               error "cannot create/write a volatile file"
+       $CHECKSTAT -t file $MOUNT/.lustre/fid/$fid 2>/dev/null &&
+               error "FID is still valid after close"
+
+       multiop_bg_pause $DIR/$tdir vVw4096_c
+       local multi_pid=$!
+
+       local OLD_IFS=$IFS
+       IFS=":"
+       local fidv=($fid)
+       IFS=$OLD_IFS
+       # assume that the next FID for this client is sequential (second
+       # ":"-separated field of the previous FID + 1), since stdout
+       # is unfortunately eaten by multiop_bg_pause
+       local n=$((${fidv[1]} + 1))
+       local next_fid="${fidv[0]}:$(printf "0x%x" $n):${fidv[2]}"
+       $CHECKSTAT -t file $MOUNT/.lustre/fid/$next_fid ||
+               error "FID is missing before close"
+       kill -USR1 $multi_pid
+       # 1 second delay, so that a change of the directory mtime (1 second
+       # resolution with stat %Y) would be visible
+       sleep 1
+       local mtime2=$(stat -c "%Y" $DIR/$tdir)
+       [[ $mtime1 == $mtime2 ]] || error "mtime has changed"
+}
+run_test 185 "Volatile file support"
+
 # OST pools tests
 check_file_in_pool()
 {
index 8fef145..95b3a71 100644 (file)
@@ -72,6 +72,8 @@
 #include <lustre/lustreapi.h>
 
 #include <libcfs/libcfsutil.h>
+#include <obd.h>
+#include <obd_lov.h>
 #include "obdctl.h"
 
 /* all functions */
@@ -118,32 +120,37 @@ static int lfs_hsm_remove(int argc, char **argv);
 static int lfs_hsm_cancel(int argc, char **argv);
 static int lfs_swap_layouts(int argc, char **argv);
 
+#define SETSTRIPE_USAGE(_cmd, _tgt) /* usage text shared by the "setstripe" and "migrate" entries of cmdlist; _cmd and _tgt must be string literals, they are pasted into the text. NOTE(review): the --block line is also printed for setstripe although long_opts marks it as valid only in migrate mode - confirm this is intended */ \
+       "usage: "_cmd" [--stripe-count|-c <stripe_count>]\n"\
+       "                 [--stripe-index|-i <start_ost_idx>]\n"\
+       "                 [--stripe-size|-S <stripe_size>]\n"\
+       "                 [--pool|-p <pool_name>]\n"\
+       "                 [--block|-b] "_tgt"\n"\
+       "\tstripe_size:  Number of bytes on each OST (0 filesystem default)\n"\
+       "\t              Can be specified with k, m or g (in KB, MB and GB\n"\
+       "\t              respectively)\n"\
+       "\tstart_ost_idx: OST index of first stripe (-1 default)\n"\
+       "\tstripe_count: Number of OSTs to stripe over (0 default, -1 all)\n"\
+       "\tpool_name:    Name of OST pool to use (default none)\n"\
+       "\tblock:        Block file access during data migration"
+
 /* all avaialable commands */
 command_t cmdlist[] = {
-        {"setstripe", lfs_setstripe, 0,
-         "Create a new file with a specific striping pattern or\n"
-         "set the default striping pattern on an existing directory or\n"
-         "delete the default striping pattern from an existing directory\n"
-         "usage: setstripe [--stripe-count|-c <stripe_count>]\n"
-         "                 [--stripe-index|-i <start_ost_idx>]\n"
-         "                 [--stripe-size|-S <stripe_size>]\n"
-         "                 [--pool|-p <pool_name>] <directory|filename>\n"
-         " or\n"
-         "       setstripe -d <directory>   (to delete default striping)\n"
-         "\tstripe_size:  Number of bytes on each OST (0 filesystem default)\n"
-         "\t              Can be specified with k, m or g (in KB, MB and GB\n"
-         "\t              respectively)\n"
-         "\tstart_ost_idx: OST index of first stripe (-1 default)\n"
-         "\tstripe_count: Number of OSTs to stripe over (0 default, -1 all)\n"
-         "\tpool_name:    Name of OST pool to use (default none)"},
-        {"getstripe", lfs_getstripe, 0,
-         "To list the striping info for a given file or files in a\n"
-         "directory or recursively for all files in a directory tree.\n"
-         "usage: getstripe [--ost|-O <uuid>] [--quiet | -q] [--verbose | -v]\n"
-         "                 [--stripe-count|-c] [--stripe-index|-i]\n"
-         "                 [--pool|-p] [--stripe-size|-S] [--directory|-d]\n"
-         "                 [--mdt-index|-M] [--recursive|-r] [--raw|-R]\n"
-         "                 <directory|filename> ..."},
+       {"setstripe", lfs_setstripe, 0,
+        "Create a new file with a specific striping pattern or\n"
+        "set the default striping pattern on an existing directory or\n"
+        "delete the default striping pattern from an existing directory\n"
+        "usage: setstripe -d <directory>   (to delete default striping)\n"\
+        " or\n"
+        SETSTRIPE_USAGE("setstripe", "<directory|filename>")},
+       {"getstripe", lfs_getstripe, 0,
+        "To list the striping info for a given file or files in a\n"
+        "directory or recursively for all files in a directory tree.\n"
+        "usage: getstripe [--ost|-O <uuid>] [--quiet | -q] [--verbose | -v]\n"
+        "                 [--stripe-count|-c] [--stripe-index|-i]\n"
+        "                 [--pool|-p] [--stripe-size|-S] [--directory|-d]\n"
+        "                 [--mdt-index|-M] [--recursive|-r] [--raw|-R]\n"
+        "                 <directory|filename> ..."},
        {"setdirstripe", lfs_setdirstripe, 0,
         "To create a remote directory on a specified MDT.\n"
         "usage: setdirstripe <--index|-i mdt_index> <dir>\n"
@@ -301,6 +308,9 @@ command_t cmdlist[] = {
         "usage: hsm_cancel [--filelist FILELIST] [--data DATA] <file> ..."},
        {"swap_layouts", lfs_swap_layouts, 0, "Swap layouts between 2 files.\n"
         "usage: swap_layouts <path1> <path2>"},
+       {"migrate", lfs_setstripe, 0, "migrate file from one layout to "
+        "another (may be not safe with concurent writes).\n"
+        SETSTRIPE_USAGE("migrate  ", "<filename>")},
         {"help", Parser_help, 0, "help"},
         {"exit", Parser_quit, 0, "quit"},
         {"quit", Parser_quit, 0, "quit"},
@@ -322,70 +332,292 @@ static int isnumber(const char *str)
         return 1;
 }
 
+#define MIGRATION_BLOCKS 1
+
+/**
+ * Restripe a file by copying its data to a volatile file and swapping the
+ * layouts of the two files.
+ *
+ * A volatile file is created in the directory of \a name with the requested
+ * striping, the data of \a name is copied to it, and the two layouts are
+ * swapped. The data version of \a name is read before the copy and passed
+ * to the swap (SWAP_LAYOUTS_CHECK_DV1), which fails with -EAGAIN if it does
+ * not match anymore. The swap is asked to keep mtime and atime.
+ *
+ * \param name                 path of the file to migrate
+ * \param stripe_size          passed to llapi_file_open_pool() for the
+ *                             volatile file
+ * \param stripe_offset        passed to llapi_file_open_pool()
+ * \param stripe_count         passed to llapi_file_open_pool()
+ * \param stripe_pattern       passed to llapi_file_open_pool()
+ * \param pool_name            passed to llapi_file_open_pool()
+ * \param migration_flags      if MIGRATION_BLOCKS is set, a group lock is
+ *                             held on \a name during the copy
+ *
+ * \retval 0 on success
+ * \retval negative errno on failure
+ */
+static int lfs_migrate(char *name, unsigned long long stripe_size,
+                      int stripe_offset, int stripe_count,
+                      int stripe_pattern, char *pool_name,
+                      __u64 migration_flags)
+{
+       int                      fd, fdv;
+       char                     volatile_file[PATH_MAX];
+       char                     parent[PATH_MAX];
+       char                    *ptr;
+       int                      rc;
+       __u64                    dv1;
+       struct lov_user_md      *lum = NULL;
+       int                      lumsz;
+       int                      bufsz;
+       void                    *buf = NULL;
+       int                      rsize, wsize;
+       __u64                    rpos, wpos, bufoff;
+       int                      gid = 0, sz;
+       int                      have_gl = 0;
+
+       /* find the right size for the IO and allocate the buffer */
+       lumsz = lov_mds_md_size(LOV_MAX_STRIPE_COUNT, LOV_MAGIC_V3);
+       lum = malloc(lumsz);
+       if (lum == NULL) {
+               rc = -ENOMEM;
+               goto free;
+       }
+
+       rc = llapi_file_get_stripe(name, lum);
+       /* the call can fail for many reasons and some of them are not real
+        * errors (eg: no stripe)
+        * in case of a real error, a later call will fail with a better
+        * error management
+        * a stripe size of 0 also falls back to the default size: with an
+        * empty buffer the first read() would return 0, and the copy loop
+        * would take it for an end of file */
+       if (rc < 0 || lum->lmm_stripe_size == 0)
+               bufsz = 1024*1024;
+       else
+               bufsz = lum->lmm_stripe_size;
+       rc = posix_memalign(&buf, getpagesize(), bufsz);
+       if (rc != 0) {
+               rc = -rc;
+               goto free;
+       }
+
+       if (migration_flags & MIGRATION_BLOCKS) {
+               /* generate a random id for the grouplock */
+               fd = open("/dev/urandom", O_RDONLY);
+               if (fd == -1) {
+                       rc = -errno;
+                       fprintf(stderr, "cannot open /dev/urandom (%s)\n",
+                               strerror(-rc));
+                       goto free;
+               }
+               sz = sizeof(gid);
+               rc = read(fd, &gid, sz);
+               /* errno has to be saved before close() can change it */
+               if (rc < 0)
+                       rc = -errno;
+               close(fd);
+               if (rc < sz) {
+                       /* a short read does not set errno, and must not be
+                        * returned as a success */
+                       if (rc >= 0)
+                               rc = -EIO;
+                       fprintf(stderr, "cannot read %d bytes from"
+                               " /dev/urandom (%s)\n", sz, strerror(-rc));
+                       goto free;
+               }
+       }
+
+       /* search for file directory pathname */
+       if (strlen(name) >= sizeof(parent)) {
+               rc = -ENAMETOOLONG;
+               fprintf(stderr, "cannot migrate %s (%s)\n", name,
+                       strerror(-rc));
+               goto free;
+       }
+       strcpy(parent, name);
+       ptr = strrchr(parent, '/');
+       if (ptr == NULL) {
+               if (getcwd(parent, sizeof(parent)) == NULL) {
+                       rc = -errno;
+                       goto free;
+               }
+       } else {
+               if (ptr == parent)
+                       strcpy(parent, "/");
+               else
+                       *ptr = '\0';
+       }
+       /* parent can be up to PATH_MAX - 1 bytes long, so the volatile
+        * name built from it may not fit */
+       rc = snprintf(volatile_file, sizeof(volatile_file), "%s/%s::",
+                     parent, LUSTRE_VOLATILE_HDR);
+       if (rc < 0 || rc >= (int)sizeof(volatile_file)) {
+               rc = -ENAMETOOLONG;
+               fprintf(stderr, "cannot create volatile file in %s (%s)\n",
+                       parent, strerror(-rc));
+               goto free;
+       }
+
+       /* create, open a volatile file, use caching (ie no directio) */
+       /* exclusive create is not needed because volatile files cannot
+        * conflict on name by construction */
+       fdv = llapi_file_open_pool(volatile_file, O_CREAT | O_WRONLY,
+                                  0644, stripe_size, stripe_offset,
+                                  stripe_count, stripe_pattern, pool_name);
+       if (fdv < 0) {
+               rc = fdv;
+               fprintf(stderr, "cannot create volatile file in %s (%s)\n",
+                       parent, strerror(-rc));
+               goto free;
+       }
+
+       /* open file, direct io */
+       /* even if the file is only read, WR mode is needed to allow
+        * layout swap on fd */
+       fd = open(name, O_RDWR | O_DIRECT);
+       if (fd == -1) {
+               rc = -errno;
+               fprintf(stderr, "cannot open %s (%s)\n", name, strerror(-rc));
+               close(fdv);
+               goto free;
+       }
+
+       /* get file data version */
+       rc = llapi_get_data_version(fd, &dv1, 0);
+       if (rc != 0) {
+               fprintf(stderr, "cannot get dataversion on %s (%s)\n",
+                       name, strerror(-rc));
+               goto error;
+       }
+
+       if (migration_flags & MIGRATION_BLOCKS) {
+               /* take group lock to limit concurrent access
+                * this will be no more needed when exclusive access will
+                * be implemented (see LU-2919) */
+               /* group lock is taken after data version read because it
+                * blocks data version call */
+               if (ioctl(fd, LL_IOC_GROUP_LOCK, gid) == -1) {
+                       rc = -errno;
+                       fprintf(stderr, "cannot get group lock on %s (%s)\n",
+                               name, strerror(-rc));
+                       goto error;
+               }
+               have_gl = 1;
+       }
+
+       /* copy data */
+       rpos = 0;
+       wpos = 0;
+       bufoff = 0;
+       rsize = -1;
+       do {
+               /* read new data only if we have written all
+                * previously read data */
+               if (wpos == rpos) {
+                       rsize = read(fd, buf, bufsz);
+                       if (rsize < 0) {
+                               rc = -errno;
+                               fprintf(stderr, "read failed on %s"
+                                       " (%s)\n", name,
+                                       strerror(-rc));
+                               goto error;
+                       }
+                       rpos += rsize;
+                       bufoff = 0;
+               }
+               /* eof ? */
+               if (rsize == 0)
+                       break;
+               /* write() can be partial: wpos/bufoff track what is left */
+               wsize = write(fdv, (char *)buf + bufoff, rpos - wpos);
+               if (wsize < 0) {
+                       rc = -errno;
+                       fprintf(stderr, "write failed on volatile"
+                               " for %s (%s)\n", name, strerror(-rc));
+                       goto error;
+               }
+               wpos += wsize;
+               bufoff += wsize;
+       } while (1);
+
+       /* flush data, the layouts must not be swapped if the copy could
+        * not be flushed */
+       if (fsync(fdv) == -1) {
+               rc = -errno;
+               fprintf(stderr, "cannot flush volatile file for %s (%s)\n",
+                       name, strerror(-rc));
+               goto error;
+       }
+
+       if (migration_flags & MIGRATION_BLOCKS) {
+               /* give back group lock */
+               if (ioctl(fd, LL_IOC_GROUP_UNLOCK, gid) == -1) {
+                       rc = -errno;
+                       fprintf(stderr, "cannot put group lock on %s (%s)\n",
+                               name, strerror(-rc));
+               }
+               have_gl = 0;
+       }
+
+       /* swap layouts
+        * for a migration we need to:
+        * - check data version on file did not change
+        * - keep file mtime
+        * - keep file atime
+        */
+       rc = llapi_fswap_layouts(fd, fdv, dv1, 0,
+                                SWAP_LAYOUTS_CHECK_DV1 |
+                                SWAP_LAYOUTS_KEEP_MTIME |
+                                SWAP_LAYOUTS_KEEP_ATIME);
+       if (rc == -EAGAIN) {
+               fprintf(stderr, "file dataversion for %s has changed"
+                               " during copy, migration is aborted\n",
+                       name);
+               goto error;
+       }
+       if (rc != 0)
+               fprintf(stderr, "cannot swap layouts between %s and "
+                       "a volatile file (%s)\n",
+                       name, strerror(-rc));
+
+error:
+       /* give back group lock */
+       if ((migration_flags & MIGRATION_BLOCKS) && have_gl &&
+           (ioctl(fd, LL_IOC_GROUP_UNLOCK, gid) == -1)) {
+               /* we keep in rc the original error */
+               /* errno is positive, it must not be negated for strerror() */
+               fprintf(stderr, "cannot put group lock on %s (%s)\n",
+                       name, strerror(errno));
+       }
+
+       close(fdv);
+       close(fd);
+free:
+       free(lum);
+       free(buf);
+       return rc;
+}
+
 /* functions */
 static int lfs_setstripe(int argc, char **argv)
 {
-        char *fname;
-        int result;
-        unsigned long long st_size;
-        int  st_offset, st_count;
-        char *end;
-        int c;
-        int delete = 0;
-        char *stripe_size_arg = NULL;
-        char *stripe_off_arg = NULL;
-        char *stripe_count_arg = NULL;
-        char *pool_name_arg = NULL;
-        unsigned long long size_units = 1;
+       char                    *fname;
+       int                      result;
+       unsigned long long       st_size;
+       int                      st_offset, st_count;
+       char                    *end;
+       int                      c;
+       int                      delete = 0;
+       char                    *stripe_size_arg = NULL;
+       char                    *stripe_off_arg = NULL;
+       char                    *stripe_count_arg = NULL;
+       char                    *pool_name_arg = NULL;
+       unsigned long long       size_units = 1;
+       int                      migrate_mode = 0;
+       __u64                    migration_flags = 0;
 
-        struct option long_opts[] = {
+       struct option            long_opts[] = {
+               /* valid only in migrate mode */
+               {"block",        no_argument,       0, 'b'},
 #if LUSTRE_VERSION >= OBD_OCD_VERSION(2,9,50,0)
 #warning "remove deprecated --count option"
 #else
-                /* This formerly implied "stripe-count", but was explicitly
-                 * made "stripe-count" for consistency with other options,
-                 * and to separate it from "mdt-count" when DNE arrives. */
-                {"count",        required_argument, 0, 'c'},
+               /* This formerly implied "stripe-count", but was explicitly
+                * made "stripe-count" for consistency with other options,
+                * and to separate it from "mdt-count" when DNE arrives. */
+               {"count",        required_argument, 0, 'c'},
 #endif
-                {"stripe-count", required_argument, 0, 'c'},
-                {"stripe_count", required_argument, 0, 'c'},
-                {"delete",       no_argument,       0, 'd'},
+               {"stripe-count", required_argument, 0, 'c'},
+               {"stripe_count", required_argument, 0, 'c'},
+               {"delete",       no_argument,       0, 'd'},
 #if LUSTRE_VERSION >= OBD_OCD_VERSION(2,9,50,0)
 #warning "remove deprecated --index option"
 #else
-                /* This formerly implied "stripe-index", but was explicitly
-                 * made "stripe-index" for consistency with other options,
-                 * and to separate it from "mdt-index" when DNE arrives. */
-                {"index",        required_argument, 0, 'i'},
+               /* This formerly implied "stripe-index", but was explicitly
+                * made "stripe-index" for consistency with other options,
+                * and to separate it from "mdt-index" when DNE arrives. */
+               {"index",        required_argument, 0, 'i'},
 #endif
-                {"stripe-index", required_argument, 0, 'i'},
-                {"stripe_index", required_argument, 0, 'i'},
+               {"stripe-index", required_argument, 0, 'i'},
+               {"stripe_index", required_argument, 0, 'i'},
 #if LUSTRE_VERSION >= OBD_OCD_VERSION(2,9,50,0)
 #warning "remove deprecated --offset option"
 #else
-                /* This formerly implied "stripe-index", but was confusing
-                 * with "file offset" (which will eventually be needed for
-                 * with different layouts by offset), so deprecate it. */
-                {"offset",       required_argument, 0, 'o'},
+               /* This formerly implied "stripe-index", but was confusing
+                * with "file offset" (which will eventually be needed for
+                * with different layouts by offset), so deprecate it. */
+               {"offset",       required_argument, 0, 'o'},
 #endif
-                {"pool",         required_argument, 0, 'p'},
+               {"pool",         required_argument, 0, 'p'},
 #if LUSTRE_VERSION >= OBD_OCD_VERSION(2,9,50,0)
 #warning "remove deprecated --size option"
 #else
-                /* This formerly implied "--stripe-size", but was confusing
-                 * with "lfs find --size|-s", which means "file size", so use
-                 * the consistent "--stripe-size|-S" for all commands. */
-                {"size",         required_argument, 0, 's'},
+               /* This formerly implied "--stripe-size", but was confusing
+                * with "lfs find --size|-s", which means "file size", so use
+                * the consistent "--stripe-size|-S" for all commands. */
+               {"size",         required_argument, 0, 's'},
 #endif
-                {"stripe-size",  required_argument, 0, 'S'},
-                {"stripe_size",  required_argument, 0, 'S'},
-                {0, 0, 0, 0}
-        };
+               {"stripe-size",  required_argument, 0, 'S'},
+               {"stripe_size",  required_argument, 0, 'S'},
+               {0, 0, 0, 0}
+       };
 
         st_size = 0;
         st_offset = -1;
         st_count = 0;
 
+       if (strcmp(argv[0], "migrate") == 0)
+               migrate_mode = 1;
+
 #if LUSTRE_VERSION < OBD_OCD_VERSION(2,4,50,0)
         if (argc == 5 && argv[1][0] != '-' &&
             isnumber(argv[2]) && isnumber(argv[3]) && isnumber(argv[4])) {
@@ -404,6 +636,14 @@ static int lfs_setstripe(int argc, char **argv)
                 case 0:
                         /* Long options. */
                         break;
+               case 'b':
+                       if (migrate_mode == 0) {
+                               fprintf(stderr, "--block is valid only for"
+                                               " migrate mode");
+                               return CMD_HELP;
+                       }
+                       migration_flags |= MIGRATION_BLOCKS;
+                       break;
                 case 'c':
 #if LUSTRE_VERSION >= OBD_OCD_VERSION(2,9,50,0)
 #warning "remove deprecated --count option"
@@ -502,18 +742,26 @@ static int lfs_setstripe(int argc, char **argv)
                 }
         }
 
-        do {
-                result = llapi_file_create_pool(fname, st_size, st_offset,
-                                                st_count, 0, pool_name_arg);
-                if (result) {
-                        fprintf(stderr,"error: %s: create stripe file '%s' "
-                                "failed\n", argv[0], fname);
-                        break;
-                }
-                fname = argv[++optind];
-        } while (fname != NULL);
+       do {
+               if (migrate_mode)
+                       result = lfs_migrate(fname, st_size, st_offset,
+                                            st_count, 0, pool_name_arg,
+                                            migration_flags);
+               else
+                       result = llapi_file_create_pool(fname, st_size,
+                                                       st_offset, st_count,
+                                                       0, pool_name_arg);
+               if (result) {
+                       fprintf(stderr,
+                               "error: %s: %s stripe file '%s' failed\n",
+                               argv[0], migrate_mode ? "migrate" : "create",
+                               fname);
+                       break;
+               }
+               fname = argv[++optind];
+       } while (fname != NULL);
 
-        return result;
+       return result;
 }
 
 static int lfs_poollist(int argc, char **argv)
@@ -3315,7 +3563,9 @@ static int lfs_swap_layouts(int argc, char **argv)
        if (argc != 3)
                return CMD_HELP;
 
-       return llapi_swap_layouts(argv[1], argv[2]);
+       return llapi_swap_layouts(argv[1], argv[2], 0, 0,
+                                 SWAP_LAYOUTS_KEEP_MTIME |
+                                 SWAP_LAYOUTS_KEEP_ATIME);
 }
 
 int main(int argc, char **argv)
index ee5bbf1..fb1e69f 100644 (file)
@@ -4144,15 +4144,17 @@ int llapi_create_volatile_idx(char *directory, int idx, int mode)
  * first fd received the ioctl, second fd is passed as arg
  * this is assymetric but avoid use of root path for ioctl
  */
-int llapi_fswap_layouts(int fd1, int fd2)
+int llapi_fswap_layouts(int fd1, int fd2, __u64 dv1, __u64 dv2, __u64 flags)
 {
        struct lustre_swap_layouts lsl;
        int rc;
 
        srandom(time(NULL));
        lsl.sl_fd = fd2;
-       lsl.sl_flags = 0;
+       lsl.sl_flags = flags;
        lsl.sl_gid = random();
+       lsl.sl_dv1 = dv1;
+       lsl.sl_dv2 = dv2;
        rc = ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
        if (rc)
                rc = -errno;
@@ -4163,7 +4165,8 @@ int llapi_fswap_layouts(int fd1, int fd2)
  * Swap the layouts between 2 files
  * the 2 files are open in write
  */
-int llapi_swap_layouts(const char *path1, const char *path2)
+int llapi_swap_layouts(const char *path1, const char *path2,
+                      __u64 dv1, __u64 dv2, __u64 flags)
 {
        int     fd1, fd2, rc;
 
@@ -4184,7 +4187,7 @@ int llapi_swap_layouts(const char *path1, const char *path2)
                return -errno;
        }
 
-       rc = llapi_fswap_layouts(fd1, fd2);
+       rc = llapi_fswap_layouts(fd1, fd2, dv1, dv2, flags);
        if (rc < 0)
                llapi_error(LLAPI_MSG_ERROR, rc,
                        "error: cannot swap layouts between %s and %s\n",