Whamcloud - gitweb
LU-3345 llapi: add user space method for lov_user_md
[fs/lustre-release.git] / lustre / utils / lfs.c
index e180ffb..f258453 100644 (file)
@@ -27,7 +27,7 @@
  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2011, 2012, Intel Corporation.
+ * Copyright (c) 2011, 2013, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -72,6 +72,8 @@
 #include <lustre/lustreapi.h>
 
 #include <libcfs/libcfsutil.h>
+#include <obd.h>
+#include <obd_lov.h>
 #include "obdctl.h"
 
 /* all functions */
@@ -116,34 +118,39 @@ static int lfs_hsm_restore(int argc, char **argv);
 static int lfs_hsm_release(int argc, char **argv);
 static int lfs_hsm_remove(int argc, char **argv);
 static int lfs_hsm_cancel(int argc, char **argv);
-
+static int lfs_swap_layouts(int argc, char **argv);
+
+#define SETSTRIPE_USAGE(_cmd, _tgt) \
+       "usage: "_cmd" [--stripe-count|-c <stripe_count>]\n"\
+       "                 [--stripe-index|-i <start_ost_idx>]\n"\
+       "                 [--stripe-size|-S <stripe_size>]\n"\
+       "                 [--pool|-p <pool_name>]\n"\
+       "                 [--block|-b] "_tgt"\n"\
+       "\tstripe_size:  Number of bytes on each OST (0 filesystem default)\n"\
+       "\t              Can be specified with k, m or g (in KB, MB and GB\n"\
+       "\t              respectively)\n"\
+       "\tstart_ost_idx: OST index of first stripe (-1 default)\n"\
+       "\tstripe_count: Number of OSTs to stripe over (0 default, -1 all)\n"\
+       "\tpool_name:    Name of OST pool to use (default none)\n"\
+       "\tblock:        Block file access during data migration"
 
 /* all avaialable commands */
 command_t cmdlist[] = {
-        {"setstripe", lfs_setstripe, 0,
-         "Create a new file with a specific striping pattern or\n"
-         "set the default striping pattern on an existing directory or\n"
-         "delete the default striping pattern from an existing directory\n"
-         "usage: setstripe [--stripe-count|-c <stripe_count>]\n"
-         "                 [--stripe-index|-i <start_ost_idx>]\n"
-         "                 [--stripe-size|-S <stripe_size>]\n"
-         "                 [--pool|-p <pool_name>] <directory|filename>\n"
-         " or\n"
-         "       setstripe -d <directory>   (to delete default striping)\n"
-         "\tstripe_size:  Number of bytes on each OST (0 filesystem default)\n"
-         "\t              Can be specified with k, m or g (in KB, MB and GB\n"
-         "\t              respectively)\n"
-         "\tstart_ost_idx: OST index of first stripe (-1 default)\n"
-         "\tstripe_count: Number of OSTs to stripe over (0 default, -1 all)\n"
-         "\tpool_name:    Name of OST pool to use (default none)"},
-        {"getstripe", lfs_getstripe, 0,
-         "To list the striping info for a given file or files in a\n"
-         "directory or recursively for all files in a directory tree.\n"
-         "usage: getstripe [--ost|-O <uuid>] [--quiet | -q] [--verbose | -v]\n"
-         "                 [--stripe-count|-c] [--stripe-index|-i]\n"
-         "                 [--pool|-p] [--stripe-size|-S] [--directory|-d]\n"
-         "                 [--mdt-index|-M] [--recursive|-r] [--raw|-R]\n"
-         "                 <directory|filename> ..."},
+       {"setstripe", lfs_setstripe, 0,
+        "Create a new file with a specific striping pattern or\n"
+        "set the default striping pattern on an existing directory or\n"
+        "delete the default striping pattern from an existing directory\n"
+        "usage: setstripe -d <directory>   (to delete default striping)\n"\
+        " or\n"
+        SETSTRIPE_USAGE("setstripe", "<directory|filename>")},
+       {"getstripe", lfs_getstripe, 0,
+        "To list the striping info for a given file or files in a\n"
+        "directory or recursively for all files in a directory tree.\n"
+        "usage: getstripe [--ost|-O <uuid>] [--quiet | -q] [--verbose | -v]\n"
+        "                 [--stripe-count|-c] [--stripe-index|-i]\n"
+        "                 [--pool|-p] [--stripe-size|-S] [--directory|-d]\n"
+        "                 [--mdt-index|-M] [--recursive|-r] [--raw|-R]\n"
+        "                 <directory|filename> ..."},
        {"setdirstripe", lfs_setdirstripe, 0,
         "To create a remote directory on a specified MDT.\n"
         "usage: setdirstripe <--index|-i mdt_index> <dir>\n"
@@ -299,6 +306,11 @@ command_t cmdlist[] = {
        {"hsm_cancel", lfs_hsm_cancel, 0,
         "Cancel requests related to specified files.\n"
         "usage: hsm_cancel [--filelist FILELIST] [--data DATA] <file> ..."},
+       {"swap_layouts", lfs_swap_layouts, 0, "Swap layouts between 2 files.\n"
+        "usage: swap_layouts <path1> <path2>"},
+       {"migrate", lfs_setstripe, 0, "migrate file from one layout to "
+        "another (may be not safe with concurent writes).\n"
+        SETSTRIPE_USAGE("migrate  ", "<filename>")},
         {"help", Parser_help, 0, "help"},
         {"exit", Parser_quit, 0, "quit"},
         {"quit", Parser_quit, 0, "quit"},
@@ -320,70 +332,292 @@ static int isnumber(const char *str)
         return 1;
 }
 
+#define MIGRATION_BLOCKS 1
+
+static int lfs_migrate(char *name, unsigned long long stripe_size,
+                      int stripe_offset, int stripe_count,
+                      int stripe_pattern, char *pool_name,
+                      __u64 migration_flags)
+{
+       int                      fd, fdv;
+       char                     volatile_file[PATH_MAX];
+       char                     parent[PATH_MAX];
+       char                    *ptr;
+       int                      rc;
+       __u64                    dv1;
+       struct lov_user_md      *lum = NULL;
+       int                      lumsz;
+       int                      bufsz;
+       void                    *buf = NULL;
+       int                      rsize, wsize;
+       __u64                    rpos, wpos, bufoff;
+       int                      gid = 0, sz;
+       int                      have_gl = 0;
+
+       /* find the right size for the IO and allocate the buffer */
+       lumsz = lov_user_md_size(LOV_MAX_STRIPE_COUNT, LOV_USER_MAGIC_V3);
+       lum = malloc(lumsz);
+       if (lum == NULL) {
+               rc = -ENOMEM;
+               goto free;
+       }
+
+       rc = llapi_file_get_stripe(name, lum);
+       /* failure can come from may case and some may be not real error
+        * (eg: no stripe)
+        * in case of a real error, a later call will failed with a better
+        * error management */
+       if (rc < 0)
+               bufsz = 1024*1024;
+       else
+               bufsz = lum->lmm_stripe_size;
+       rc = posix_memalign(&buf, getpagesize(), bufsz);
+       if (rc != 0) {
+               rc = -rc;
+               goto free;
+       }
+
+       if (migration_flags & MIGRATION_BLOCKS) {
+               /* generate a random id for the grouplock */
+               fd = open("/dev/urandom", O_RDONLY);
+               if (fd == -1) {
+                       rc = -errno;
+                       fprintf(stderr, "cannot open /dev/urandom (%s)\n",
+                               strerror(-rc));
+                       goto free;
+               }
+               sz = sizeof(gid);
+               rc = read(fd, &gid, sz);
+               close(fd);
+               if (rc < sz) {
+                       rc = -errno;
+                       fprintf(stderr, "cannot read %d bytes from"
+                               " /dev/urandom (%s)\n", sz, strerror(-rc));
+                       goto free;
+               }
+       }
+
+       /* search for file directory pathname */
+       strcpy(parent, name);
+       ptr = strrchr(parent, '/');
+       if (ptr == NULL) {
+               if (getcwd(parent, sizeof(parent)) == NULL) {
+                       rc = -errno;
+                       goto free;
+               }
+       } else {
+               if (ptr == parent)
+                       strcpy(parent, "/");
+               else
+                       *ptr = '\0';
+       }
+       sprintf(volatile_file, "%s/%s::", parent, LUSTRE_VOLATILE_HDR);
+
+       /* create, open a volatile file, use caching (ie no directio) */
+       /* exclusive create is not needed because volatile files cannot
+        * conflict on name by construction */
+       fdv = llapi_file_open_pool(volatile_file, O_CREAT | O_WRONLY,
+                                  0644, stripe_size, stripe_offset,
+                                  stripe_count, stripe_pattern, pool_name);
+       if (fdv < 0) {
+               rc = fdv;
+               fprintf(stderr, "cannot create volatile file in %s (%s)\n",
+                       parent, strerror(-rc));
+               goto free;
+       }
+
+       /* open file, direct io */
+       /* even if the file is only read, WR mode is nedeed to allow
+        * layout swap on fd */
+       fd = open(name, O_RDWR | O_DIRECT);
+       if (fd == -1) {
+               rc = -errno;
+               fprintf(stderr, "cannot open %s (%s)\n", name, strerror(-rc));
+               close(fdv);
+               goto free;
+       }
+
+       /* get file data version */
+       rc = llapi_get_data_version(fd, &dv1, 0);
+       if (rc != 0) {
+               fprintf(stderr, "cannot get dataversion on %s (%s)\n",
+                       name, strerror(-rc));
+               goto error;
+       }
+
+       if (migration_flags & MIGRATION_BLOCKS) {
+               /* take group lock to limit concurent access
+                * this will be no more needed when exclusive access will
+                * be implemented (see LU-2919) */
+               /* group lock is taken after data version read because it
+                * blocks data version call */
+               if (ioctl(fd, LL_IOC_GROUP_LOCK, gid) == -1) {
+                       rc = -errno;
+                       fprintf(stderr, "cannot get group lock on %s (%s)\n",
+                               name, strerror(-rc));
+                       goto error;
+               }
+               have_gl = 1;
+       }
+
+       /* copy data */
+       rpos = 0;
+       wpos = 0;
+       bufoff = 0;
+       rsize = -1;
+       do {
+               /* read new data only if we have written all
+                * previously read data */
+               if (wpos == rpos) {
+                       rsize = read(fd, buf, bufsz);
+                       if (rsize < 0) {
+                               rc = -errno;
+                               fprintf(stderr, "read failed on %s"
+                                       " (%s)\n", name,
+                                       strerror(-rc));
+                               goto error;
+                       }
+                       rpos += rsize;
+                       bufoff = 0;
+               }
+               /* eof ? */
+               if (rsize == 0)
+                       break;
+               wsize = write(fdv, buf + bufoff, rpos - wpos);
+               if (wsize < 0) {
+                       rc = -errno;
+                       fprintf(stderr, "write failed on volatile"
+                               " for %s (%s)\n", name, strerror(-rc));
+                       goto error;
+               }
+               wpos += wsize;
+               bufoff += wsize;
+       } while (1);
+
+       /* flush data */
+       fsync(fdv);
+
+       if (migration_flags & MIGRATION_BLOCKS) {
+               /* give back group lock */
+               if (ioctl(fd, LL_IOC_GROUP_UNLOCK, gid) == -1) {
+                       rc = -errno;
+                       fprintf(stderr, "cannot put group lock on %s (%s)\n",
+                               name, strerror(-rc));
+               }
+               have_gl = 0;
+       }
+
+       /* swap layouts
+        * for a migration we need to:
+        * - check data version on file did not change
+        * - keep file mtime
+        * - keep file atime
+        */
+       rc = llapi_fswap_layouts(fd, fdv, dv1, 0,
+                                SWAP_LAYOUTS_CHECK_DV1 |
+                                SWAP_LAYOUTS_KEEP_MTIME |
+                                SWAP_LAYOUTS_KEEP_ATIME);
+       if (rc == -EAGAIN) {
+               fprintf(stderr, "file dataversion for %s has changed"
+                               " during copy, migration is aborted\n",
+                       name);
+               goto error;
+       }
+       if (rc != 0)
+               fprintf(stderr, "cannot swap layouts between %s and "
+                       "a volatile file (%s)\n",
+                       name, strerror(-rc));
+
+error:
+       /* give back group lock */
+       if ((migration_flags & MIGRATION_BLOCKS) && have_gl &&
+           (ioctl(fd, LL_IOC_GROUP_UNLOCK, gid) == -1)) {
+               /* we keep in rc the original error */
+               fprintf(stderr, "cannot put group lock on %s (%s)\n",
+                       name, strerror(-errno));
+       }
+
+       close(fdv);
+       close(fd);
+free:
+       if (lum)
+               free(lum);
+       if (buf)
+               free(buf);
+       return rc;
+}
+
 /* functions */
 static int lfs_setstripe(int argc, char **argv)
 {
-        char *fname;
-        int result;
-        unsigned long long st_size;
-        int  st_offset, st_count;
-        char *end;
-        int c;
-        int delete = 0;
-        char *stripe_size_arg = NULL;
-        char *stripe_off_arg = NULL;
-        char *stripe_count_arg = NULL;
-        char *pool_name_arg = NULL;
-        unsigned long long size_units = 1;
+       char                    *fname;
+       int                      result;
+       unsigned long long       st_size;
+       int                      st_offset, st_count;
+       char                    *end;
+       int                      c;
+       int                      delete = 0;
+       char                    *stripe_size_arg = NULL;
+       char                    *stripe_off_arg = NULL;
+       char                    *stripe_count_arg = NULL;
+       char                    *pool_name_arg = NULL;
+       unsigned long long       size_units = 1;
+       int                      migrate_mode = 0;
+       __u64                    migration_flags = 0;
 
-        struct option long_opts[] = {
+       struct option            long_opts[] = {
+               /* valid only in migrate mode */
+               {"block",        no_argument,       0, 'b'},
 #if LUSTRE_VERSION >= OBD_OCD_VERSION(2,9,50,0)
 #warning "remove deprecated --count option"
 #else
-                /* This formerly implied "stripe-count", but was explicitly
-                 * made "stripe-count" for consistency with other options,
-                 * and to separate it from "mdt-count" when DNE arrives. */
-                {"count",        required_argument, 0, 'c'},
+               /* This formerly implied "stripe-count", but was explicitly
+                * made "stripe-count" for consistency with other options,
+                * and to separate it from "mdt-count" when DNE arrives. */
+               {"count",        required_argument, 0, 'c'},
 #endif
-                {"stripe-count", required_argument, 0, 'c'},
-                {"stripe_count", required_argument, 0, 'c'},
-                {"delete",       no_argument,       0, 'd'},
+               {"stripe-count", required_argument, 0, 'c'},
+               {"stripe_count", required_argument, 0, 'c'},
+               {"delete",       no_argument,       0, 'd'},
 #if LUSTRE_VERSION >= OBD_OCD_VERSION(2,9,50,0)
 #warning "remove deprecated --index option"
 #else
-                /* This formerly implied "stripe-index", but was explicitly
-                 * made "stripe-index" for consistency with other options,
-                 * and to separate it from "mdt-index" when DNE arrives. */
-                {"index",        required_argument, 0, 'i'},
+               /* This formerly implied "stripe-index", but was explicitly
+                * made "stripe-index" for consistency with other options,
+                * and to separate it from "mdt-index" when DNE arrives. */
+               {"index",        required_argument, 0, 'i'},
 #endif
-                {"stripe-index", required_argument, 0, 'i'},
-                {"stripe_index", required_argument, 0, 'i'},
+               {"stripe-index", required_argument, 0, 'i'},
+               {"stripe_index", required_argument, 0, 'i'},
 #if LUSTRE_VERSION >= OBD_OCD_VERSION(2,9,50,0)
 #warning "remove deprecated --offset option"
 #else
-                /* This formerly implied "stripe-index", but was confusing
-                 * with "file offset" (which will eventually be needed for
-                 * with different layouts by offset), so deprecate it. */
-                {"offset",       required_argument, 0, 'o'},
+               /* This formerly implied "stripe-index", but was confusing
+                * with "file offset" (which will eventually be needed for
+                * with different layouts by offset), so deprecate it. */
+               {"offset",       required_argument, 0, 'o'},
 #endif
-                {"pool",         required_argument, 0, 'p'},
+               {"pool",         required_argument, 0, 'p'},
 #if LUSTRE_VERSION >= OBD_OCD_VERSION(2,9,50,0)
 #warning "remove deprecated --size option"
 #else
-                /* This formerly implied "--stripe-size", but was confusing
-                 * with "lfs find --size|-s", which means "file size", so use
-                 * the consistent "--stripe-size|-S" for all commands. */
-                {"size",         required_argument, 0, 's'},
+               /* This formerly implied "--stripe-size", but was confusing
+                * with "lfs find --size|-s", which means "file size", so use
+                * the consistent "--stripe-size|-S" for all commands. */
+               {"size",         required_argument, 0, 's'},
 #endif
-                {"stripe-size",  required_argument, 0, 'S'},
-                {"stripe_size",  required_argument, 0, 'S'},
-                {0, 0, 0, 0}
-        };
+               {"stripe-size",  required_argument, 0, 'S'},
+               {"stripe_size",  required_argument, 0, 'S'},
+               {0, 0, 0, 0}
+       };
 
         st_size = 0;
         st_offset = -1;
         st_count = 0;
 
+       if (strcmp(argv[0], "migrate") == 0)
+               migrate_mode = 1;
+
 #if LUSTRE_VERSION < OBD_OCD_VERSION(2,4,50,0)
         if (argc == 5 && argv[1][0] != '-' &&
             isnumber(argv[2]) && isnumber(argv[3]) && isnumber(argv[4])) {
@@ -402,6 +636,14 @@ static int lfs_setstripe(int argc, char **argv)
                 case 0:
                         /* Long options. */
                         break;
+               case 'b':
+                       if (migrate_mode == 0) {
+                               fprintf(stderr, "--block is valid only for"
+                                               " migrate mode");
+                               return CMD_HELP;
+                       }
+                       migration_flags |= MIGRATION_BLOCKS;
+                       break;
                 case 'c':
 #if LUSTRE_VERSION >= OBD_OCD_VERSION(2,9,50,0)
 #warning "remove deprecated --count option"
@@ -500,18 +742,26 @@ static int lfs_setstripe(int argc, char **argv)
                 }
         }
 
-        do {
-                result = llapi_file_create_pool(fname, st_size, st_offset,
-                                                st_count, 0, pool_name_arg);
-                if (result) {
-                        fprintf(stderr,"error: %s: create stripe file '%s' "
-                                "failed\n", argv[0], fname);
-                        break;
-                }
-                fname = argv[++optind];
-        } while (fname != NULL);
+       do {
+               if (migrate_mode)
+                       result = lfs_migrate(fname, st_size, st_offset,
+                                            st_count, 0, pool_name_arg,
+                                            migration_flags);
+               else
+                       result = llapi_file_create_pool(fname, st_size,
+                                                       st_offset, st_count,
+                                                       0, pool_name_arg);
+               if (result) {
+                       fprintf(stderr,
+                               "error: %s: %s stripe file '%s' failed\n",
+                               argv[0], migrate_mode ? "migrate" : "create",
+                               fname);
+                       break;
+               }
+               fname = argv[++optind];
+       } while (fname != NULL);
 
-        return result;
+       return result;
 }
 
 static int lfs_poollist(int argc, char **argv)
@@ -2899,7 +3149,7 @@ static int lfs_hsm_state(int argc, char **argv)
                if (rc) {
                        fprintf(stderr, "can't get hsm state for %s: %s\n",
                                path, strerror(errno = -rc));
-                       return rc;
+                       return rc;
                }
 
                /* Display path name and status flags */
@@ -3091,7 +3341,7 @@ static int lfs_hsm_prepare_file(char *file, struct lu_fid *fid,
 
        rc = lstat(file, &st);
        if (rc) {
-               fprintf(stderr, "Cannot stat %s: %s\n", file, strerror(-errno));
+               fprintf(stderr, "Cannot stat %s: %s\n", file, strerror(errno));
                return -errno;
        }
        /* A request should be ... */
@@ -3308,6 +3558,16 @@ static int lfs_hsm_cancel(int argc, char **argv)
        return lfs_hsm_request(argc, argv, HUA_CANCEL);
 }
 
+static int lfs_swap_layouts(int argc, char **argv)
+{
+       if (argc != 3)
+               return CMD_HELP;
+
+       return llapi_swap_layouts(argv[1], argv[2], 0, 0,
+                                 SWAP_LAYOUTS_KEEP_MTIME |
+                                 SWAP_LAYOUTS_KEEP_ATIME);
+}
+
 int main(int argc, char **argv)
 {
         int rc;