Whamcloud - gitweb
b=14836
authornathan <nathan>
Mon, 15 Sep 2008 23:25:34 +0000 (23:25 +0000)
committernathan <nathan>
Mon, 15 Sep 2008 23:25:34 +0000 (23:25 +0000)
i=nathan
i=adilger
OST pools on HEAD, comprehensive patch including 17054:19007;
16935:18918,19012,19089,19128; 16978:18872

53 files changed:
lustre/ChangeLog
lustre/doc/lfs.1
lustre/include/lprocfs_status.h
lustre/include/lustre/liblustreapi.h
lustre/include/lustre/lustre_idl.h
lustre/include/lustre/lustre_user.h
lustre/include/lustre_cfg.h
lustre/include/lustre_lib.h
lustre/include/obd.h
lustre/include/obd_class.h
lustre/include/obd_lov.h
lustre/ldlm/ldlm_lib.c
lustre/liblustre/super.c
lustre/llite/dir.c
lustre/llite/file.c
lustre/llite/llite_lib.c
lustre/lov/Makefile.in
lustre/lov/autoMakefile.am
lustre/lov/lov_ea.c
lustre/lov/lov_internal.h
lustre/lov/lov_obd.c
lustre/lov/lov_pack.c
lustre/lov/lov_pool.c [new file with mode: 0644]
lustre/lov/lov_qos.c
lustre/mdd/mdd_lov.c
lustre/mdd/mdd_trans.c
lustre/mds/handler.c
lustre/mds/mds_internal.h
lustre/mds/mds_lov.c
lustre/mgs/mgs_handler.c
lustre/mgs/mgs_internal.h
lustre/mgs/mgs_llog.c
lustre/obdclass/debug.c
lustre/obdclass/lprocfs_status.c
lustre/obdclass/obd_config.c
lustre/obdfilter/filter.c
lustre/osc/osc_request.c
lustre/ptlrpc/pack_generic.c
lustre/ptlrpc/ptlrpc_module.c
lustre/tests/ll_dirstripe_verify.c
lustre/tests/ll_getstripe_info.c
lustre/tests/llmount.sh
lustre/tests/sanity.sh
lustre/tests/test-framework.sh
lustre/utils/Makefile.am
lustre/utils/lctl.c
lustre/utils/lfs.c
lustre/utils/liblustreapi.c
lustre/utils/llog_reader.c
lustre/utils/obd.c
lustre/utils/obdctl.h
lustre/utils/req-layout.c
lustre/utils/wirecheck.c

index 4141ff9..4ecc54e 100644 (file)
@@ -95,6 +95,10 @@ Details    : When connection is reused this not moved from CONN_UNUSED_HASH
             again in unused hash.
 
 Severity   : enhancement
+Bugzilla   : 15899
+Description: File striping can now be set to use an arbitrary pool of OSTs.
+       
+Severity   : enhancement
 Bugzilla   : 16573
 Description: Export bytes_read/bytes_write count on OSC/OST.
 
index 0fca528..cea475f 100644 (file)
@@ -17,18 +17,29 @@ lfs \- Lustre utility to create a file with specific striping pattern, find the
         \fB[[!] --uid|-u N] [[!] --user|-U <name>]
         \fB<dirname|filename>\fR
 .br
+.B lfs osts
+.br
 .B lfs getstripe [--obd|-O <uuid>] [--quiet|-q] [--verbose|-v] 
-              \fB[--recursive|-r] <dirname|filename>\fR
+        \fB[--recursive|-r] <dirname|filename>\fR
 .br
 .B lfs setstripe [--size|-s stripe-size] [--count|-c stripe-cnt]
-              \fB[--index|-i start-ost] <filename|dirname>\fR
+        \fB[--offset|-o start-ost] [--pool|-p pool-name]
+        \fB<dir|filename>\fR
 .br
-.B lfs setstripe -d <dirname>
+.B lfs setstripe -d <dir>
 .br
-.B lfs quotachown [-i] <filesystem>
+.B lfs poollist <filesystem>[.<pool>] | <pathname>
+.br
+.B lfs quota [-v] [-o obd_uuid] [-u|-g] <username|groupname> <filesystem>
+.br
+.B lfs quota <filesystem>
+.br
+.B lfs quota -t [-u|-g] <filesystem>
 .br
 .B lfs quotacheck [-ug] <filesystem>
 .br
+.B lfs quotachown [-i] <filesystem>
+.br
 .B lfs quotaon [-ugf] <filesystem>
 .br
 .B lfs quotaoff [-ug] <filesystem>
@@ -59,13 +70,15 @@ Report filesystem disk space usage or inodes usage of each MDT/OST.
 .B find 
 To search the directory tree rooted at the given dir/file name for the files that match the given parameters: \fB--atime\fR (file was last accessed N*24 hours ago), \fB--ctime\fR (file's status was last changed N*24 hours ago), \fB--mtime\fR (file's data was last modified N*24 hours ago), \fB--obd\fR (file has an object on a specific OST or OSTs), \fB--size\fR (file has size in bytes, or \fBk\fRilo-, \fBM\fRega-, \fBG\fRiga-, \fBT\fRera-, \fBP\fReta-, or \fBE\fRxabytes if a suffix is given), \fB--type\fR (file has the type: \fBb\fRlock, \fBc\fRharacter, \fBd\fRirectory, \fBp\fRipe, \fBf\fRile, sym\fBl\fRink, \fBs\fRocket, or \fBD\fRoor (Solaris)), \fB--uid\fR (file has specific numeric user ID), \fB--user\fR (file owned by specific user, numeric user ID allowed), \fB--gid\fR (file has specific group ID), \fB--group\fR (file belongs to specific group, numeric group ID allowed). The option \fB--maxdepth\fR allows find to decend at most N levels of directory tree. The options \fB--print\fR and \fB--print0\fR print full file name, followed by a newline or NUL character correspondingly.  Using \fB!\fR before an option negates its meaning (\fIfiles NOT matching the parameter\fR).  Using \fB+\fR before a numeric value means \fIfiles with the parameter OR MORE\fR, while \fB-\fR before a numeric value means \fIfiles with the parameter OR LESS\fR.
 .TP
-.B getstripe
-To list the striping info for a given filename or files in a directory, optionally recursively, for all files in a directory tree: \fB--quiet\fR (don't print object IDs), \fB--verbose\fR (print striping parameters), \fB--recursive\fR (recurse into subdirectories).
-.TP
 .B osts 
 List all the OSTs for the filesystem
 .TP
-.B setstripe [--size stripe-size] [--count stripe-cnt] [--index start-ost]
+.B getstripe
+To list the striping info for a given filename or files in a directory, optionally recursively, for all files in a directory tree: \fB--quiet\fR (don't print object IDs), \fB--verbose\fR (print striping parameters), \fB--recursive\fR (recurse into subdirectories).
+.TP
+.B setstripe [--size stripe-size] [--count stripe-cnt] 
+       \fB[--offset start-ost] [--pool pool-name]\fR
+.br
 To create a new file, or set the directory default, with the specified striping parameters.  The
 .I stripe-count
 is the number of OSTs to stripe a file over. A
@@ -78,15 +91,24 @@ is the number of bytes to store on each OST before moving to the next OST.  A
 .I stripe-size
 of 0 means to use the filesystem-wide default stripe size (default 1MB).  The
 .I start-ost
-is the OST index (starting at 0) on which to start striping for this file.  A
+is the OST index (base 10, starting at 0) on which to start striping for this file.  A
 .I start-ost
-of -1 allows the MDS to specify the starting index and it is strongly
-recommended that the starting OST not be given, as this allows space and
-load balancing to be done by the MDS as needed.
+of -1 allows the MDS to choose the starting index and it is strongly recommended, as this allows space and load balancing to be done by the MDS as needed.  The
+.I pool-name
+is the name of a predefined pool of OSTs (see 
+.I lctl
+) that will be used for striping. The 
+.I stripe-count, stripe-size, start-ost
+will be used as well; the 
+.I start-ost
+must be part of the pool or an error will be returned. 
 .TP
-.B lfs setstripe -d
+.B setstripe -d
 Delete the default striping on the specified directory.
 .TP
+.B poollist <filesystem>[.<pool>] | <pathname>
+List the pools in \fBfilesystem\fR or \fBpathname\fR, or the OSTs in \fBfilesystem.pool\fR
+.TP
 .B quotachown
 To change files' owner and group on OSTs of the specified filesystem
 .TP
index c3e69c7..9696a7b 100644 (file)
@@ -403,9 +403,12 @@ extern int lprocfs_add_clear_entry(struct obd_device * obd,
 extern int lprocfs_exp_setup(struct obd_export *exp,
                              lnet_nid_t *peer_nid, int *newnid);
 extern int lprocfs_exp_cleanup(struct obd_export *exp);
-extern int lprocfs_add_simple(struct proc_dir_entry *root,
-                              char *name, cfs_read_proc_t *read_proc,
-                              cfs_write_proc_t *write_proc, void *data);
+extern cfs_proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
+                                                char *name,
+                                                cfs_read_proc_t *read_proc,
+                                                cfs_write_proc_t *write_proc,
+                                                void *data,
+                                                struct file_operations *fops);
 extern struct proc_dir_entry *lprocfs_add_symlink(const char *name,
                         struct proc_dir_entry *parent, const char *dest);
 extern void lprocfs_free_per_client_stats(struct obd_device *obd);
@@ -436,10 +439,6 @@ extern cfs_proc_dir_entry_t *lprocfs_srch(cfs_proc_dir_entry_t *root,
 
 extern int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list);
 extern int lprocfs_obd_cleanup(struct obd_device *obd);
-extern int lprocfs_add_simple(struct proc_dir_entry *root, char *name,
-                              cfs_read_proc_t *read_proc,
-                              cfs_write_proc_t *write_proc,
-                              void *data);
 extern void lprocfs_free_per_client_stats(struct obd_device *obd);
 extern struct file_operations lprocfs_evict_client_fops;
 
@@ -658,11 +657,12 @@ static inline int lprocfs_exp_setup(struct obd_export *exp,
 { return 0; }
 static inline int lprocfs_exp_cleanup(struct obd_export *exp)
 { return 0; }
-static inline int lprocfs_add_simple(struct proc_dir_entry *root,
+static inline cfs_proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
                                      char *name,
                                      cfs_read_proc_t *read_proc,
                                      cfs_write_proc_t *write_proc,
-                                     void *data)
+                                     void *data,
+                                     struct file_operations *fops)
 {return 0; }
 static inline struct proc_dir_entry *lprocfs_add_symlink(const char *name,
                         struct proc_dir_entry *parent, const char *dest)
index 24857e4..e6c1e43 100644 (file)
@@ -73,6 +73,14 @@ extern int llapi_file_create(const char *name, unsigned long stripe_size,
 extern int llapi_file_open(const char *name, int flags, int mode,
                            unsigned long stripe_size, int stripe_offset,
                            int stripe_count, int stripe_pattern);
+extern int llapi_file_create_pool(const char *name, unsigned long stripe_size,
+                                  int stripe_offset, int stripe_count,
+                                  int stripe_pattern, char *pool_name);
+extern int llapi_file_open_pool(const char *name, int flags, int mode,
+                                unsigned long stripe_size, int stripe_offset,
+                                int stripe_count, int stripe_pattern,
+                                char *pool_name);
+extern int llapi_poollist(char *name);
 extern int llapi_file_get_stripe(const char *path, struct lov_user_md *lum);
 #define HAVE_LLAPI_FILE_LOOKUP
 extern int llapi_file_lookup(int dirfd, const char *name);
@@ -102,7 +110,9 @@ struct find_param {
                         exclude_gid:1,
                         exclude_uid:1,
                         check_gid:1,
-                        check_uid:1;
+                        check_uid:1,
+                        check_pool:1,
+                        exclude_pool:1;
 
         int     verbose;
         int     quiet;
@@ -124,6 +134,8 @@ struct find_param {
         /* In-precess parameters. */
         unsigned int depth;
         dev_t   st_dev;
+
+        char poolname[MAXPOOLNAME+1];
 };
 
 extern int llapi_getstripe(char *path, struct find_param *param);
@@ -136,7 +148,7 @@ extern int llapi_ping(char *obd_type, char *obd_name);
 extern int llapi_target_check(int num_types, char **obd_types, char *dir);
 extern int llapi_catinfo(char *dir, char *keyword, char *node_name);
 extern int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid);
-extern int llapi_file_get_lov_fuuid(int fd, struct obd_uuid *lov_uuid);
+extern int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid);
 extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count);
 extern int llapi_is_lustre_mnttype(const char *type);
 extern int parse_size(char *optarg, unsigned long long *size,
index 4f09b84..942ec44 100644 (file)
@@ -607,6 +607,7 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
                                               *b=10600 */
 #define OBD_CONNECT_CKSUM      0x20000000ULL /* support several cksum algos */
 #define OBD_CONNECT_FID        0x40000000ULL /* FID is supported by server */
+#define OBD_CONNECT_LOV_V3    0x100000000ULL /* client supports lov v3 ea */
 
 /* also update obd_connect_names[] for lprocfs_rd_connect_flags()
  * and lustre/utils/wirecheck.c */
@@ -626,7 +627,8 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb);
                                 OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | \
                                 OBD_CONNECT_MDS_MDS | OBD_CONNECT_CANCELSET | \
                                 OBD_CONNECT_FID | \
-                                LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_AT)
+                                LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_AT | \
+                                OBD_CONNECT_LOV_V3)
 #define OST_CONNECT_SUPPORTED  (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \
                                 OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \
                                 OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \
@@ -748,6 +750,7 @@ typedef __u32 obd_count;
 #define LOV_MAGIC_V1      0x0BD10BD0
 #define LOV_MAGIC         LOV_MAGIC_V1
 #define LOV_MAGIC_JOIN    0x0BD20BD0
+#define LOV_MAGIC_V3      0x0BD30BD0
 
 #define LOV_PATTERN_RAID0 0x001   /* stripes are used round-robin */
 #define LOV_PATTERN_RAID1 0x002   /* stripes are mirrors of each other */
@@ -757,6 +760,9 @@ typedef __u32 obd_count;
 #define LOV_OBJECT_GROUP_DEFAULT ~0ULL
 #define LOV_OBJECT_GROUP_CLEAR 0ULL
 
+#define MAXPOOLNAME 16
+#define POOLNAMEF "%.16s"
+
 #define lov_ost_data lov_ost_data_v1
 struct lov_ost_data_v1 {          /* per-stripe data structure (little-endian)*/
         __u64 l_object_id;        /* OST object ID */
@@ -776,7 +782,7 @@ struct lov_mds_md_v1 {            /* LOV EA mds/wire data (little-endian) */
         struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
 };
 
-extern void lustre_swab_lov_mds_md(struct lov_mds_md *llm);
+/* extern void lustre_swab_lov_mds_md(struct lov_mds_md *llm); */
 
 #define MAX_MD_SIZE (sizeof(struct lov_mds_md) + 4 * sizeof(struct lov_ost_data))
 #define MIN_MD_SIZE (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data))
@@ -785,6 +791,18 @@ extern void lustre_swab_lov_mds_md(struct lov_mds_md *llm);
 #define XATTR_NAME_ACL_DEFAULT  "system.posix_acl_default"
 #define XATTR_NAME_LOV          "trusted.lov"
 
+struct lov_mds_md_v3 {            /* LOV EA mds/wire data (little-endian) */
+        __u32 lmm_magic;          /* magic number = LOV_MAGIC_V3 */
+        __u32 lmm_pattern;        /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+        __u64 lmm_object_id;      /* LOV object ID */
+        __u64 lmm_object_gr;      /* LOV object group */
+        __u32 lmm_stripe_size;    /* size of stripe in bytes */
+        __u32 lmm_stripe_count;   /* num stripes in use for this object */
+        char  lmm_pool_name[MAXPOOLNAME]; /* must be 32bit aligned */
+        struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+};
+
+
 #define OBD_MD_FLID        (0x00000001ULL) /* object ID */
 #define OBD_MD_FLATIME     (0x00000002ULL) /* access time */
 #define OBD_MD_FLMTIME     (0x00000004ULL) /* data modification time */
@@ -2151,8 +2169,10 @@ extern void lustre_swab_ost_body (struct ost_body *b);
 extern void lustre_swab_ost_last_id(obd_id *id);
 extern void lustre_swab_fiemap(struct ll_user_fiemap *fiemap);
 
-extern void lustre_swab_lov_user_md(struct lov_user_md *lum);
-extern void lustre_swab_lov_user_md_objects(struct lov_user_md *lum);
+extern void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum);
+extern void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum);
+extern void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
+                                            int stripe_count);
 extern void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj);
 
 /* llog_swab.c */
index 2fd8bc5..ed40b0e 100644 (file)
@@ -129,13 +129,15 @@ struct obd_statfs;
 
 #define LOV_USER_MAGIC_V1 0x0BD10BD0
 #define LOV_USER_MAGIC    LOV_USER_MAGIC_V1
-
 #define LOV_USER_MAGIC_JOIN 0x0BD20BD0
+#define LOV_USER_MAGIC_V3 0x0BD30BD0
 
 #define LOV_PATTERN_RAID0 0x001
 #define LOV_PATTERN_RAID1 0x002
 #define LOV_PATTERN_FIRST 0x100
 
+#define MAXPOOLNAME 16
+
 #define lov_user_ost_data lov_user_ost_data_v1
 struct lov_user_ost_data_v1 {     /* per-stripe data structure */
         __u64 l_object_id;        /* OST object ID */
@@ -156,6 +158,18 @@ struct lov_user_md_v1 {           /* LOV EA user data (host-endian) */
         struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
 } __attribute__((packed));
 
+struct lov_user_md_v3 {           /* LOV EA user data (host-endian) */
+        __u32 lmm_magic;          /* magic number = LOV_USER_MAGIC_V3 */
+        __u32 lmm_pattern;        /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */
+        __u64 lmm_object_id;      /* LOV object ID */
+        __u64 lmm_object_gr;      /* LOV object group */
+        __u32 lmm_stripe_size;    /* size of stripe in bytes */
+        __u16 lmm_stripe_count;   /* num stripes in use for this object */
+        __u16 lmm_stripe_offset;  /* starting stripe offset in lmm_objects */
+        char  lmm_pool_name[MAXPOOLNAME]; /* pool name */
+        struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */
+} __attribute__((packed));
+
 /* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to
  * use this.  It is unsafe to #define those values in this header as it
  * is possible the application has already #included <sys/stat.h>. */
@@ -163,7 +177,12 @@ struct lov_user_md_v1 {           /* LOV EA user data (host-endian) */
 #define lov_user_mds_data lov_user_mds_data_v1
 struct lov_user_mds_data_v1 {
         lstat_t lmd_st;                 /* MDS stat struct */
-        struct lov_user_md_v1 lmd_lmm;  /* LOV EA user data */
+        struct lov_user_md_v1 lmd_lmm;  /* LOV EA V1 user data */
+} __attribute__((packed));
+
+struct lov_user_mds_data_v3 {
+        lstat_t lmd_st;                 /* MDS stat struct */
+        struct lov_user_md_v3 lmd_lmm;  /* LOV EA V3 user data */
 } __attribute__((packed));
 #endif
 
index 266d6f5..e52a9f3 100644 (file)
@@ -73,6 +73,10 @@ enum lcfg_command_type {
         LCFG_ADD_MDC        = 0x00cf014,
         LCFG_DEL_MDC        = 0x00cf015,
         LCFG_SPTLRPC_CONF   = 0x00ce016,
+        LCFG_POOL_NEW       = 0x00ce020,
+        LCFG_POOL_ADD       = 0x00ce021,
+        LCFG_POOL_REM       = 0x00ce022,
+        LCFG_POOL_DEL       = 0x00ce023,
 };
 
 struct lustre_cfg_bufs {
@@ -222,7 +226,7 @@ static inline struct lustre_cfg *lustre_cfg_new(int cmd,
         OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount,
                                        bufs->lcfg_buflen));
         if (!lcfg)
-                RETURN(lcfg);
+                RETURN(ERR_PTR(-ENOMEM));
 
         lcfg->lcfg_version = LUSTRE_CFG_VERSION;
         lcfg->lcfg_command = cmd;
index 0a8eb9a..83697fe 100644 (file)
@@ -498,6 +498,7 @@ static inline void obd_ioctl_freedata(char *buf, int len)
 #define OBD_IOC_DUMP_LOG               _IOWR('f', 185, OBD_IOC_DATA_TYPE)
 #define OBD_IOC_CLEAR_LOG              _IOWR('f', 186, OBD_IOC_DATA_TYPE)
 #define OBD_IOC_PARAM                  _IOW ('f', 187, OBD_IOC_DATA_TYPE)
+#define OBD_IOC_POOL                   _IOWR('f', 188, OBD_IOC_DATA_TYPE)
 
 #define OBD_IOC_CATLOGLIST             _IOWR('f', 190, OBD_IOC_DATA_TYPE)
 #define OBD_IOC_LLOG_INFO              _IOWR('f', 191, OBD_IOC_DATA_TYPE)
index 109c9af..81cd3d8 100644 (file)
@@ -150,6 +150,7 @@ struct lov_stripe_md {
                 __u32 lw_stripe_size;      /* size of the stripe */
                 __u32 lw_pattern;          /* striping pattern (RAID0, RAID1) */
                 unsigned lw_stripe_count;  /* number of objects being striped over */
+                char  lw_pool_name[MAXPOOLNAME]; /* pool name */
         } lsm_wire;
 
         struct lov_array_info *lsm_array; /*Only for joined file array info*/
@@ -163,6 +164,7 @@ struct lov_stripe_md {
 #define lsm_stripe_size  lsm_wire.lw_stripe_size
 #define lsm_pattern      lsm_wire.lw_pattern
 #define lsm_stripe_count lsm_wire.lw_stripe_count
+#define lsm_pool_name    lsm_wire.lw_pool_name
 
 struct obd_info;
 
@@ -649,15 +651,32 @@ struct ltd_qos {
         unsigned int        ltq_usable:1;    /* usable for striping */
 };
 
+/* Generic subset of OSTs */
+struct ost_pool {
+        __u32              *op_array;        /* array of indexes into
+                                                lov_obd->lov_tgts */
+        unsigned int        op_count;        /* number of OSTs in the array */
+        unsigned int        op_size;         /* allocated size of op_array */
+        rwlock_t            op_rwlock;       /* to protect lov_pool use */
+};
+
+/* Round-robin allocator data */
+struct lov_qos_rr {
+        __u32               lqr_start_idx;   /* start index of new inode */
+        __u32               lqr_offset_idx;  /* aliasing for start_idx  */
+        int                 lqr_start_count; /* reseed counter */
+        struct ost_pool     lqr_pool;        /* round-robin optimized list */
+        unsigned long       lqr_dirty:1;     /* recalc round-robin list */
+};
+
+/* Stripe placement optimization */
 struct lov_qos {
         struct list_head    lq_oss_list;    /* list of OSSs that targets use */
         struct rw_semaphore lq_rw_sem;
         __u32               lq_active_oss_count;
-        __u32              *lq_rr_array;    /* round-robin optimized list */
-        unsigned int        lq_rr_size;     /* rr array size */
         unsigned int        lq_prio_free;   /* priority for free space */
+        struct lov_qos_rr   lq_rr;          /* round robin qos data */
         unsigned long       lq_dirty:1,     /* recalc qos data */
-                            lq_dirty_rr:1,  /* recalc round-robin list */
                             lq_same_space:1,/* the ost's all have approx.
                                                the same space avail */
                             lq_reset:1;     /* zero current penalties */
@@ -674,9 +693,29 @@ struct lov_tgt_desc {
                             ltd_reap:1;  /* should this target be deleted */
 };
 
+/* Pool metadata */
+#define pool_tgt_size(_p)   _p->pool_obds.op_size
+#define pool_tgt_count(_p)  _p->pool_obds.op_count
+#define pool_tgt_array(_p)  _p->pool_obds.op_array
+#define pool_tgt_rwlock(_p) _p->pool_obds.op_rwlock
+#define pool_tgt(_p, _i)    _p->pool_lov->lov_tgts[_p->pool_obds.op_array[_i]]
+
+struct pool_desc {
+        char                    pool_name[MAXPOOLNAME + 1]; /* name of pool */
+        struct ost_pool         pool_obds;              /* pool members */
+        struct lov_qos_rr       pool_rr;                /* round robin qos */
+        struct hlist_node       pool_hash;              /* access by poolname */
+        struct list_head        pool_list;              /* serial access */
+        cfs_proc_dir_entry_t   *pool_proc_entry;        /* file in /proc */
+        struct lov_obd         *pool_lov;               /* lov obd to which this
+                                                           pool belongs */
+};
+
 struct lov_obd {
         struct lov_desc         desc;
-        struct lov_tgt_desc   **lov_tgts;
+        struct lov_tgt_desc   **lov_tgts;              /* sparse array */
+        struct ost_pool         lov_packed;            /* all OSTs in a packed
+                                                          array */
         struct semaphore        lov_lock;
         struct obd_connect_data lov_ocd;
         struct lov_qos          lov_qos;               /* qos info per lov */
@@ -685,13 +724,14 @@ struct lov_obd {
         __u32                   lov_active_tgt_count;  /* how many active */
         __u32                   lov_death_row;/* tgts scheduled to be deleted */
         __u32                   lov_tgt_size;   /* size of tgts array */
-        __u32                   lov_start_idx;  /* start index of new inode */
-        __u32                   lov_offset_idx; /* aliasing for start_idx  */
-        int                     lov_start_count;/* reseed counter */
         int                     lov_connects;
         obd_page_removal_cb_t   lov_page_removal_cb;
         obd_pin_extent_cb       lov_page_pin_cb;
         obd_lock_cancel_cb      lov_lock_cancel_cb;
+        int                     lov_pool_count;
+        lustre_hash_t          *lov_pools_hash_body; /* used for key access */
+        struct list_head        lov_pool_list; /* used for sequential access */
+        cfs_proc_dir_entry_t   *lov_pool_proc_entry;
 };
 
 struct lmv_tgt_desc {
@@ -1340,7 +1380,13 @@ struct obd_ops {
                                        obd_lock_cancel_cb cb);
         int (*o_unregister_lock_cancel_cb)(struct obd_export *exp,
                                          obd_lock_cancel_cb cb);
-
+        /* pools methods */
+        int (*o_pool_new)(struct obd_device *obd, char *poolname);
+        int (*o_pool_del)(struct obd_device *obd, char *poolname);
+        int (*o_pool_add)(struct obd_device *obd, char *poolname,
+                          char *ostname);
+        int (*o_pool_rem)(struct obd_device *obd, char *poolname,
+                          char *ostname);
         /*
          * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line
          * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c.
@@ -1511,15 +1557,18 @@ struct lsm_operations {
                              struct lov_mds_md *lmm);
 };
 
-extern struct lsm_operations lsm_plain_ops;
+extern struct lsm_operations lsm_v1_ops;
 extern struct lsm_operations lsm_join_ops;
+extern struct lsm_operations lsm_v3_ops;
 static inline struct lsm_operations *lsm_op_find(int magic)
 {
         switch(magic) {
-        case LOV_MAGIC:
-               return &lsm_plain_ops;
+        case LOV_MAGIC_V1:
+               return &lsm_v1_ops;
         case LOV_MAGIC_JOIN:
                return &lsm_join_ops;
+        case LOV_MAGIC_V3:
+               return &lsm_v3_ops;
         default:
                CERROR("Cannot recognize lsm_magic %d\n", magic);
                return NULL;
index 8a73f27..fa44819 100644 (file)
@@ -946,6 +946,54 @@ static inline int obd_ping(struct obd_export *exp)
         RETURN(rc);
 }
 
+static inline int obd_pool_new(struct obd_device *obd, char *poolname)
+{
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP);
+        OBD_COUNTER_INCREMENT(obd, pool_new);
+
+        rc = OBP(obd, pool_new)(obd, poolname);
+        RETURN(rc);
+}
+
+static inline int obd_pool_del(struct obd_device *obd, char *poolname)
+{
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP);
+        OBD_COUNTER_INCREMENT(obd, pool_del);
+
+        rc = OBP(obd, pool_del)(obd, poolname);
+        RETURN(rc);
+}
+
+static inline int obd_pool_add(struct obd_device *obd, char *poolname, char *ostname)
+{
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP);
+        OBD_COUNTER_INCREMENT(obd, pool_add);
+
+        rc = OBP(obd, pool_add)(obd, poolname, ostname);
+        RETURN(rc);
+}
+
+static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ostname)
+{
+        int rc;
+        ENTRY;
+
+        OBD_CHECK_DT_OP(obd, pool_rem, -EOPNOTSUPP);
+        OBD_COUNTER_INCREMENT(obd, pool_rem);
+
+        rc = OBP(obd, pool_rem)(obd, poolname, ostname);
+        RETURN(rc);
+}
+
 static inline int obd_init_export(struct obd_export *exp)
 {
         int rc = 0;
index 64798ab..da3ca51 100644 (file)
@@ -42,13 +42,17 @@ static inline int lov_stripe_md_size(int stripes)
         return sizeof(struct lov_stripe_md) + stripes*sizeof(struct lov_oinfo*);
 }
 
-#define lov_mds_md_size(stripes) lov_mds_md_v1_size(stripes)
-static inline int lov_mds_md_v1_size(int stripes)
+static inline int lov_mds_md_size(int stripes, int lmm_magic)
 {
-        return sizeof(struct lov_mds_md_v1) +
-                stripes * sizeof(struct lov_ost_data_v1);
+        if (lmm_magic == LOV_MAGIC_V3)
+                return sizeof(struct lov_mds_md_v3) +
+                        stripes * sizeof(struct lov_ost_data_v1);
+        else
+                return sizeof(struct lov_mds_md_v1) +
+                        stripes * sizeof(struct lov_ost_data_v1);
 }
 
+
 #define IOC_LOV_TYPE                   'g'
 #define IOC_LOV_MIN_NR                 50
 #define IOC_LOV_SET_OSC_ACTIVE         _IOWR('g', 50, long)
index 3978b71..27cd186 100644 (file)
@@ -345,7 +345,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
 
         cli->cl_import = imp;
         /* cli->cl_max_mds_{easize,cookiesize} updated by mdc_init_ea_size() */
-        cli->cl_max_mds_easize = sizeof(struct lov_mds_md);
+        cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3);
         cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie);
 
         if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) {
index 552a586..d7d37cf 100644 (file)
@@ -1730,11 +1730,25 @@ static int llu_lov_dir_setstripe(struct inode *ino, unsigned long arg)
         if (rc)
                 return(-EFAULT);
 
-        if (lum.lmm_magic != LOV_USER_MAGIC)
+        switch (lum.lmm_magic) {
+        case LOV_USER_MAGIC_V1: {
+                if (lum.lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1))
+                        lustre_swab_lov_user_md_v1(&lum);
+                break;
+                }
+        case LOV_USER_MAGIC_V3: {
+                if (lum.lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3))
+                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)&lum);
+                break;
+                }
+        default: {
+                CDEBUG(D_IOCTL, "bad userland LOV MAGIC:"
+                                " %#08x != %#08x nor %#08x\n",
+                                lum.lmm_magic, LOV_USER_MAGIC_V1,
+                                LOV_USER_MAGIC_V3);
                 RETURN(-EINVAL);
-
-        if (lum.lmm_magic != cpu_to_le32(LOV_USER_MAGIC))
-                lustre_swab_lov_user_md(&lum);
+        }
+        }
 
         /* swabbing is done in lov_setstripe() on server side */
         rc = md_setattr(sbi->ll_md_exp, &op_data, &lum,
@@ -1968,7 +1982,9 @@ struct inode *llu_iget(struct filesys *fs, struct lustre_md *md)
 static int
 llu_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp)
 {
-        struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC };
+        /* even if default lov is LOV_MAGIC_V1 we use LOV_MAGIC_V3
+         * to be sure the buffers are large enough */
+        struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC_V3 };
         __u32 valsize = sizeof(struct lov_desc);
         int rc, easize, def_easize, cookiesize;
         struct lov_desc desc;
index 55fa6a8..930bd26 100644 (file)
@@ -555,17 +555,34 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
         struct lustre_sb_info *lsi = s2lsi(inode->i_sb);
         struct obd_device *mgc = lsi->lsi_mgc;
         char *fsname = NULL, *param = NULL;
+        int lum_size;
 
         /*
          * This is coming from userspace, so should be in
          * local endian.  But the MDS would like it in little
          * endian, so we swab it before we send it.
          */
-        if (lump->lmm_magic != LOV_USER_MAGIC)
+        switch (lump->lmm_magic) {
+        case LOV_USER_MAGIC_V1: {
+                if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1))
+                        lustre_swab_lov_user_md_v1(lump);
+                lum_size = sizeof(struct lov_user_md_v1);
+                break;
+                }
+        case LOV_USER_MAGIC_V3: {
+                if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3))
+                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lump);
+                lum_size = sizeof(struct lov_user_md_v3);
+                break;
+                }
+        default: {
+                CDEBUG(D_IOCTL, "bad userland LOV MAGIC:"
+                                " %#08x != %#08x nor %#08x\n",
+                                lump->lmm_magic, LOV_USER_MAGIC_V1,
+                                LOV_USER_MAGIC_V3);
                 RETURN(-EINVAL);
-
-        if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC))
-                lustre_swab_lov_user_md(lump);
+                }
+        }
 
         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
                                      LUSTRE_OPC_ANY, NULL);
@@ -573,7 +590,7 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
                 RETURN(PTR_ERR(op_data));
 
         /* swabbing is done in lov_setstripe() on server side */
-        rc = md_setattr(sbi->ll_md_exp, op_data, lump, sizeof(*lump),
+        rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size,
                         NULL, 0, &req, NULL);
         ll_finish_md_op_data(op_data);
         ptlrpc_req_finished(req);
@@ -582,6 +599,9 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
                         CERROR("mdc_setattr fails: rc = %d\n", rc);
         }
 
+        /* In the following we use the fact that LOV_USER_MAGIC_V1 and
+         LOV_USER_MAGIC_V3 have the same initial fields, so we do not
+         need to make the distinction between the 2 versions */
         if (set_default && mgc->u.cli.cl_mgc_mgsexp) {
                 OBD_ALLOC(param, MGS_PARAM_MAXLEN);
 
@@ -661,8 +681,19 @@ int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp,
          * little endian.  We convert it to host endian before
          * passing it to userspace.
          */
-        if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
-                lustre_swab_lov_user_md((struct lov_user_md *)lmm);
+        /* We don't swab objects for directories */
+        switch (le32_to_cpu(lmm->lmm_magic)) {
+        case LOV_MAGIC_V1:
+                if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
+                        lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+                break;
+        case LOV_MAGIC_V3:
+                if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC))
+                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+                break;
+        default:
+                CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic);
+                rc = -EPROTO;
         }
 out:
         *lmmp = lmm;
@@ -737,21 +768,33 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
                 return rc;
         }
         case LL_IOC_LOV_SETSTRIPE: {
-                struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
+                struct lov_user_md_v3 lumv3;
+                struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
+                struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
+                struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
+
                 int rc = 0;
                 int set_default = 0;
 
-                LASSERT(sizeof(lum) == sizeof(*lump));
-                LASSERT(sizeof(lum.lmm_objects[0]) ==
-                        sizeof(lump->lmm_objects[0]));
-                rc = copy_from_user(&lum, lump, sizeof(lum));
+                LASSERT(sizeof(lumv3) == sizeof(*lumv3p));
+                LASSERT(sizeof(lumv3.lmm_objects[0]) ==
+                        sizeof(lumv3p->lmm_objects[0]));
+                /* first try with v1 which is smaller than v3 */
+                rc = copy_from_user(lumv1, lumv1p, sizeof(*lumv1));
                 if (rc)
                         RETURN(-EFAULT);
 
+                if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
+                        rc = copy_from_user(&lumv3, lumv3p, sizeof(lumv3));
+                        if (rc)
+                                RETURN(-EFAULT);
+                }
+
                 if (inode->i_sb->s_root == file->f_dentry)
                         set_default = 1;
 
-                rc = ll_dir_setstripe(inode, &lum, set_default);
+                /* in v1 and v3 cases lumv1 points to data */
+                rc = ll_dir_setstripe(inode, lumv1, set_default);
 
                 RETURN(rc);
         }
@@ -863,6 +906,29 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file,
                 if (rc)
                         GOTO(free_lmm, rc = -EFAULT);
 
+                switch (lmm->lmm_magic) {
+                case LOV_USER_MAGIC_V1:
+                        if (LOV_USER_MAGIC_V1 == cpu_to_le32(LOV_USER_MAGIC_V1))
+                                break;
+                        /* swab objects first so that stripes num will be sane */
+                        lustre_swab_lov_user_md_objects(
+                                ((struct lov_user_md_v1 *)lmm)->lmm_objects,
+                                ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
+                        lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+                        break;
+                case LOV_USER_MAGIC_V3:
+                        if (LOV_USER_MAGIC_V3 == cpu_to_le32(LOV_USER_MAGIC_V3))
+                                break;
+                        /* swab objects first so that stripes num will be sane */
+                        lustre_swab_lov_user_md_objects(
+                                ((struct lov_user_md_v3 *)lmm)->lmm_objects,
+                                ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
+                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+                        break;
+                default:
+                        GOTO(free_lmm, rc = -EINVAL);
+                }
+
                 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
                 if (rc < 0)
                         GOTO(free_lmm, rc = -ENOMEM);
index 438593b..96961fc 100644 (file)
@@ -2091,16 +2091,35 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
         LASSERT(lmm != NULL);
 
+        if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
+            (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
+            (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
+                GOTO(out, rc = -EPROTO);
+        }
+
         /*
          * This is coming from the MDS, so is probably in
          * little endian.  We convert it to host endian before
          * passing it to userspace.
          */
-        if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
-                lustre_swab_lov_user_md((struct lov_user_md *)lmm);
-                lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
-        } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
-                lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
+        if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
+                /* if the function is called for a directory we should
+                 * avoid swabbing nonexistent lsm objects */
+                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
+                        lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
+                        if (S_ISREG(body->mode))
+                                lustre_swab_lov_user_md_objects(
+                                 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
+                                 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
+                } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
+                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
+                        if (S_ISREG(body->mode))
+                                lustre_swab_lov_user_md_objects(
+                                 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
+                                 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
+                } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
+                        lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
+                }
         }
 
         if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
@@ -2193,23 +2212,34 @@ static int ll_lov_setea(struct inode *inode, struct file *file,
 static int ll_lov_setstripe(struct inode *inode, struct file *file,
                             unsigned long arg)
 {
-        struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
+        struct lov_user_md_v3 lumv3;
+        struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
+        struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
+        struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
+        int lum_size;
         int rc;
         int flags = FMODE_WRITE;
         ENTRY;
 
-        /* Bug 1152: copy properly when this is no longer true */
-        LASSERT(sizeof(lum) == sizeof(*lump));
-        LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
-        rc = copy_from_user(&lum, lump, sizeof(lum));
+        /* first try with v1 which is smaller than v3 */
+        lum_size = sizeof(struct lov_user_md_v1);
+        rc = copy_from_user(lumv1, lumv1p, lum_size);
         if (rc)
                 RETURN(-EFAULT);
 
-        rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
+        if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
+                lum_size = sizeof(struct lov_user_md_v3);
+                rc = copy_from_user(&lumv3, lumv3p, lum_size);
+                if (rc)
+                        RETURN(-EFAULT);
+        }
+
+        rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
         if (rc == 0) {
-                 put_user(0, &lump->lmm_stripe_count);
+                 put_user(0, &lumv1p->lmm_stripe_count);
                  rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
-                                    0, ll_i2info(inode)->lli_smd, lump);
+                                    0, ll_i2info(inode)->lli_smd,
+                                    (void *)arg);
         }
         RETURN(rc);
 }
index b42fb5a..257516f 100644 (file)
@@ -248,7 +248,7 @@ static struct dentry_operations ll_d_root_ops = {
  * calculate this (via a call into the LOV + OSCs) each time we make an RPC. */
 static int ll_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp)
 {
-        struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC };
+        struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC_V3 };
         __u32 valsize = sizeof(struct lov_desc);
         int rc, easize, def_easize, cookiesize;
         struct lov_desc desc;
@@ -316,7 +316,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
                                   OBD_CONNECT_JOIN     | OBD_CONNECT_ATTRFID  |
                                   OBD_CONNECT_VERSION  | OBD_CONNECT_MDS_CAPA |
                                   OBD_CONNECT_OSS_CAPA | OBD_CONNECT_CANCELSET|
-                                  OBD_CONNECT_FID      | OBD_CONNECT_AT;
+                                  OBD_CONNECT_FID      | OBD_CONNECT_AT |
+                                  OBD_CONNECT_LOV_V3;
 
 #ifdef HAVE_LRU_RESIZE_SUPPORT
         if (sbi->ll_flags & LL_SBI_LRU_RESIZE)
@@ -1802,7 +1803,8 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md)
         LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
         if (lsm != NULL) {
                 if (lli->lli_smd == NULL) {
-                        if (lsm->lsm_magic != LOV_MAGIC &&
+                        if (lsm->lsm_magic != LOV_MAGIC_V1 &&
+                            lsm->lsm_magic != LOV_MAGIC_V3 &&
                             lsm->lsm_magic != LOV_MAGIC_JOIN) {
                                 dump_lsm(D_ERROR, lsm);
                                 LBUG();
index f714192..0f223f8 100644 (file)
@@ -1,4 +1,4 @@
 MODULES := lov
-lov-objs := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o lov_request.o lov_qos.o lov_ea.o
+lov-objs := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o lov_request.o lov_qos.o lov_ea.o lov_pool.o
 
 @INCLUDE_RULES@
index 8c3af02..c65e095 100644 (file)
@@ -36,7 +36,7 @@
 
 if LIBLUSTRE
 noinst_LIBRARIES = liblov.a
-liblov_a_SOURCES = lov_log.c lov_obd.c lov_pack.c lov_request.c lov_offset.c lov_qos.c lov_merge.c lov_ea.c lov_internal.h
+liblov_a_SOURCES = lov_log.c lov_pool.c lov_obd.c lov_pack.c lov_request.c lov_offset.c lov_qos.c lov_merge.c lov_ea.c lov_internal.h
 liblov_a_CPPFLAGS = $(LLCPPFLAGS)
 liblov_a_CFLAGS = $(LLCFLAGS)
 endif
@@ -51,6 +51,7 @@ macos_PROGRAMS := lov
 
 lov_SOURCES :=          \
         lov_log.c       \
+        lov_pool.c     \
         lov_obd.c       \
         lov_pack.c      \
         lov_request.c   \
index 8167469..1ea9d70 100755 (executable)
@@ -68,19 +68,19 @@ static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes,
 
         if (stripe_count == 0 || stripe_count > LOV_V1_INSANE_STRIPE_COUNT) {
                 CERROR("bad stripe count %d\n", stripe_count);
-                lov_dump_lmm_v1(D_WARNING, lmm);
+                lov_dump_lmm(D_WARNING, lmm);
                 return -EINVAL;
         }
-        
+
         if (lmm->lmm_object_id == 0) {
                 CERROR("zero object id\n");
-                lov_dump_lmm_v1(D_WARNING, lmm);
+                lov_dump_lmm(D_WARNING, lmm);
                 return -EINVAL;
         }
-        
+
         if (lmm->lmm_pattern != cpu_to_le32(LOV_PATTERN_RAID0)) {
                 CERROR("bad striping pattern\n");
-                lov_dump_lmm_v1(D_WARNING, lmm);
+                lov_dump_lmm(D_WARNING, lmm);
                 return -EINVAL;
         }
 
@@ -90,7 +90,7 @@ static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes,
              0xffffffff)) {
                 CERROR("bad stripe size %u\n",
                        le32_to_cpu(lmm->lmm_stripe_size));
-                lov_dump_lmm_v1(D_WARNING, lmm);
+                lov_dump_lmm(D_WARNING, lmm);
                 return -EINVAL;
         }
         return 0;
@@ -118,6 +118,7 @@ struct lov_stripe_md *lsm_alloc_plain(int stripe_count, int *size)
                 lsm->lsm_oinfo[i] = loi;
         }
         lsm->lsm_stripe_count = stripe_count;
+        lsm->lsm_pool_name[0] = '\0';
         return lsm;
 
 err:
@@ -142,10 +143,15 @@ void lsm_free_plain(struct lov_stripe_md *lsm)
 static void lsm_unpackmd_common(struct lov_stripe_md *lsm,
                                 struct lov_mds_md *lmm)
 {
+        /*
+         * This supposes that the first fields of lov_mds_md_v1/v3
+         * are the same
+         */
         lsm->lsm_object_id = le64_to_cpu(lmm->lmm_object_id);
         lsm->lsm_object_gr = le64_to_cpu(lmm->lmm_object_gr);
         lsm->lsm_stripe_size = le32_to_cpu(lmm->lmm_stripe_size);
         lsm->lsm_pattern = le32_to_cpu(lmm->lmm_pattern);
+        lsm->lsm_pool_name[0] = '\0';
 }
 
 static void
@@ -197,20 +203,20 @@ static int lsm_destroy_plain(struct lov_stripe_md *lsm, struct obdo *oa,
         return 0;
 }
 
-static int lsm_lmm_verify_plain(struct lov_mds_md *lmm, int lmm_bytes,
+static int lsm_lmm_verify_v1(struct lov_mds_md_v1 *lmm, int lmm_bytes,
                              int *stripe_count)
 {
         if (lmm_bytes < sizeof(*lmm)) {
-                CERROR("lov_mds_md too small: %d, need at least %d\n",
+                CERROR("lov_mds_md_v1 too small: %d, need at least %d\n",
                        lmm_bytes, (int)sizeof(*lmm));
                 return -EINVAL;
         }
 
         *stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
 
-        if (lmm_bytes < lov_mds_md_v1_size(*stripe_count)) {
-                CERROR("LOV EA too small: %d, need %d\n",
-                       lmm_bytes, lov_mds_md_v1_size(*stripe_count));
+        if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)) {
+                CERROR("LOV EA V1 too small: %d, need %d\n",
+                       lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V1));
                 lov_dump_lmm_v1(D_WARNING, lmm);
                 return -EINVAL;
         }
@@ -218,7 +224,7 @@ static int lsm_lmm_verify_plain(struct lov_mds_md *lmm, int lmm_bytes,
         return lsm_lmm_verify_common(lmm, lmm_bytes, *stripe_count);
 }
 
-int lsm_unpackmd_plain(struct lov_obd *lov, struct lov_stripe_md *lsm,
+int lsm_unpackmd_v1(struct lov_obd *lov, struct lov_stripe_md *lsm,
                     struct lov_mds_md_v1 *lmm)
 {
         struct lov_oinfo *loi;
@@ -249,7 +255,7 @@ int lsm_unpackmd_plain(struct lov_obd *lov, struct lov_stripe_md *lsm,
         return 0;
 }
 
-struct lsm_operations lsm_plain_ops = {
+struct lsm_operations lsm_v1_ops = {
         .lsm_free            = lsm_free_plain,
         .lsm_destroy         = lsm_destroy_plain,
         .lsm_stripe_by_index    = lsm_stripe_by_index_plain,
@@ -258,8 +264,8 @@ struct lsm_operations lsm_plain_ops = {
         .lsm_stripe_offset_by_index  = lsm_stripe_offset_by_index_plain,
         .lsm_stripe_offset_by_offset = lsm_stripe_offset_by_offset_plain,
         .lsm_stripe_index_by_offset  = lsm_stripe_index_by_offset_plain,
-        .lsm_lmm_verify         = lsm_lmm_verify_plain,
-        .lsm_unpackmd           = lsm_unpackmd_plain,
+        .lsm_lmm_verify         = lsm_lmm_verify_v1,
+        .lsm_unpackmd           = lsm_unpackmd_v1,
 };
 
 struct lov_extent *lovea_off2le(struct lov_stripe_md *lsm, obd_off lov_off)
@@ -625,3 +631,79 @@ struct lsm_operations lsm_join_ops = {
         .lsm_lmm_verify         = lsm_lmm_verify_join,
         .lsm_unpackmd           = lsm_unpackmd_join,
 };
+
+
+/*
+ * Sanity-check a LOV_MAGIC_V3 EA of lmm_bytes bytes.
+ *
+ * lmmv1 is reinterpreted as a struct lov_mds_md_v3.  The buffer must hold
+ * at least the fixed V3 header and at least
+ * lov_mds_md_size(stripe count, LOV_MAGIC_V3) bytes.  The stripe count
+ * decoded with le32_to_cpu() is stored in *stripe_count as soon as the
+ * header-size check passes, i.e. also when a later check fails.
+ *
+ * Returns -EINVAL when either size check fails, otherwise the result of
+ * lsm_lmm_verify_common().
+ */
+static int lsm_lmm_verify_v3(struct lov_mds_md *lmmv1, int lmm_bytes,
+                             int *stripe_count)
+{
+        struct lov_mds_md_v3 *lmm;
+
+        lmm = (struct lov_mds_md_v3 *)lmmv1;
+
+        /* Compare as int: against a bare sizeof() a negative lmm_bytes
+         * would be converted to a huge unsigned value and slip through
+         * this guard, letting the header be read from a bogus buffer. */
+        if (lmm_bytes < (int)sizeof(*lmm)) {
+                CERROR("lov_mds_md_v3 too small: %d, need at least %d\n",
+                       lmm_bytes, (int)sizeof(*lmm));
+                return -EINVAL;
+        }
+
+        *stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
+
+        if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)) {
+                CERROR("LOV EA V3 too small: %d, need %d\n",
+                       lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V3));
+                lov_dump_lmm_v3(D_WARNING, lmm);
+                return -EINVAL;
+        }
+
+        return lsm_lmm_verify_common((struct lov_mds_md_v1 *)lmm, lmm_bytes,
+                                     *stripe_count);
+}
+
+/*
+ * Unpack the V3 EA lmmv1 into the in-memory stripe descriptor lsm.
+ *
+ * The common header fields are filled by lsm_unpackmd_common(), the pool
+ * name is copied, and then each of the lsm->lsm_stripe_count per-stripe
+ * entries is converted from little endian into lsm->lsm_oinfo[].
+ *
+ * Returns 0 on success, or -EINVAL as soon as a stripe names an OST index
+ * that is not below lov->desc.ld_tgt_count or has no entry in
+ * lov->lov_tgts[]; stripes handled before the failure stay filled in.
+ */
+int lsm_unpackmd_v3(struct lov_obd *lov, struct lov_stripe_md *lsm,
+                    struct lov_mds_md *lmmv1)
+{
+        struct lov_mds_md_v3 *lmm_v3 = (struct lov_mds_md_v3 *)lmmv1;
+        int stripe;
+
+        lsm_unpackmd_common(lsm, (struct lov_mds_md_v1 *)lmm_v3);
+        /* NOTE(review): strncpy() leaves the destination unterminated when
+         * the source holds MAXPOOLNAME non-NUL bytes; whether
+         * lsm_pool_name has room for a terminator is not visible here --
+         * confirm against its declaration. */
+        strncpy(lsm->lsm_pool_name, lmm_v3->lmm_pool_name, MAXPOOLNAME);
+
+        for (stripe = 0; stripe < lsm->lsm_stripe_count; stripe++) {
+                struct lov_ost_data_v1 *lod = &lmm_v3->lmm_objects[stripe];
+                struct lov_oinfo *oinfo = lsm->lsm_oinfo[stripe];
+
+                /* XXX LOV STACKING call down to osc_unpackmd() */
+                oinfo->loi_ost_idx = le32_to_cpu(lod->l_ost_idx);
+                oinfo->loi_ost_gen = le32_to_cpu(lod->l_ost_gen);
+                oinfo->loi_id = le64_to_cpu(lod->l_object_id);
+                oinfo->loi_gr = le64_to_cpu(lod->l_object_gr);
+
+                /* the index must be in range before it is used to look
+                 * up the target slot below */
+                if (oinfo->loi_ost_idx >= lov->desc.ld_tgt_count) {
+                        CERROR("OST index %d more than OST count %d\n",
+                               oinfo->loi_ost_idx, lov->desc.ld_tgt_count);
+                        lov_dump_lmm_v3(D_WARNING, lmm_v3);
+                        return -EINVAL;
+                }
+
+                if (lov->lov_tgts[oinfo->loi_ost_idx] == NULL) {
+                        CERROR("OST index %d missing\n", oinfo->loi_ost_idx);
+                        lov_dump_lmm_v3(D_WARNING, lmm_v3);
+                        return -EINVAL;
+                }
+        }
+
+        return 0;
+}
+
+/*
+ * Method table for LOV_MAGIC_V3 layouts: only EA verification and
+ * unpacking have V3-specific entry points, every other slot points at
+ * the *_plain helper.
+ */
+struct lsm_operations lsm_v3_ops = {
+        /* V3-specific handlers */
+        .lsm_lmm_verify              = lsm_lmm_verify_v3,
+        .lsm_unpackmd                = lsm_unpackmd_v3,
+        /* handlers taken from the plain layout */
+        .lsm_free                    = lsm_free_plain,
+        .lsm_destroy                 = lsm_destroy_plain,
+        .lsm_revalidate              = lsm_revalidate_plain,
+        .lsm_stripe_by_index         = lsm_stripe_by_index_plain,
+        .lsm_stripe_by_offset        = lsm_stripe_by_offset_plain,
+        .lsm_stripe_offset_by_index  = lsm_stripe_offset_by_index_plain,
+        .lsm_stripe_offset_by_offset = lsm_stripe_offset_by_offset_plain,
+        .lsm_stripe_index_by_offset  = lsm_stripe_index_by_offset_plain,
+};
+
index 77154c7..9a1d66d 100644 (file)
@@ -285,6 +285,9 @@ void lov_free_memmd(struct lov_stripe_md **lsmp);
 
 void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm);
 void lov_dump_lmm_join(int level, struct lov_mds_md_join *lmmj);
+void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm);
+void lov_dump_lmm(int level, void *lmm);
+
 /* lov_ea.c */
 int lov_unpackmd_join(struct lov_obd *lov, struct lov_stripe_md *lsm,
                       struct lov_mds_md *lmm);
@@ -306,4 +309,23 @@ static inline void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars)
 }
 #endif
 
+/* pools */
+extern lustre_hash_ops_t pool_hash_operations;
+/* ost_pool methods */
+int lov_ost_pool_init(struct ost_pool *op, unsigned int count);
+int lov_ost_pool_extend(struct ost_pool *op, unsigned int max_count);
+int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int max_count);
+int lov_ost_pool_remove(struct ost_pool *op, __u32 idx);
+int lov_ost_pool_free(struct ost_pool *op);
+
+/* high level pool methods */
+int lov_pool_new(struct obd_device *obd, char *poolname);
+int lov_pool_del(struct obd_device *obd, char *poolname);
+int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname);
+int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname);
+void lov_dump_pool(int level, struct pool_desc *pool);
+struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname);
+int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool);
+
+
 #endif
index 739fee3..2456372 100644 (file)
@@ -667,7 +667,6 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
                        lov->lov_tgts, lov->lov_tgt_size);
         }
 
-
         OBD_ALLOC_PTR(tgt);
         if (!tgt) {
                 mutex_up(&lov->lov_lock);
@@ -683,6 +682,11 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
         lov->lov_tgts[index] = tgt;
         if (index >= lov->desc.ld_tgt_count)
                 lov->desc.ld_tgt_count = index + 1;
+
+        rc = lov_ost_pool_add(&lov->lov_packed, index, lov->lov_tgt_size);
+        if (rc)
+                RETURN(rc);
+
         mutex_up(&lov->lov_lock);
 
         CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n",
@@ -781,6 +785,7 @@ static void __lov_del_obd(struct obd_device *obd, __u32 index)
          * maximum tgt index for computing the mds_max_easize. So we can't
          * shrink it. */
 
+        lov_ost_pool_remove(&lov->lov_packed, index);
         lov->lov_tgts[index] = NULL;
         OBD_FREE_PTR(tgt);
 
@@ -841,6 +846,7 @@ static int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
         struct lov_desc *desc;
         struct lov_obd *lov = &obd->u.lov;
         int count;
+        int rc;
         ENTRY;
 
         if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) {
@@ -884,16 +890,27 @@ static int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
         desc->ld_active_tgt_count = 0;
         lov->desc = *desc;
         lov->lov_tgt_size = 0;
+        rc = lov_ost_pool_init(&lov->lov_packed, 0);
+        if (rc)
+                RETURN(rc);
+
         sema_init(&lov->lov_lock, 1);
         atomic_set(&lov->lov_refcount, 0);
         CFS_INIT_LIST_HEAD(&lov->lov_qos.lq_oss_list);
         init_rwsem(&lov->lov_qos.lq_rw_sem);
         lov->lov_qos.lq_dirty = 1;
-        lov->lov_qos.lq_dirty_rr = 1;
+        lov->lov_qos.lq_rr.lqr_dirty = 1;
         lov->lov_qos.lq_reset = 1;
         /* Default priority is toward free space balance */
         lov->lov_qos.lq_prio_free = 232;
 
+        lov->lov_pools_hash_body = lustre_hash_init("POOLS", 128, 128,
+                                                    &pool_hash_operations,
+                                                    0);
+
+        CFS_INIT_LIST_HEAD(&lov->lov_pool_list);
+        lov->lov_pool_count = 0;
+
         lprocfs_lov_init_vars(&lvars);
         lprocfs_obd_setup(obd, lvars.obd_vars);
 #ifdef LPROCFS
@@ -906,6 +923,9 @@ static int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
                         CWARN("Error adding the target_obd file\n");
         }
 #endif
+        lov->lov_pool_proc_entry = lprocfs_register("pools",
+                                                    obd->obd_proc_entry,
+                                                    NULL, NULL);
 
         RETURN(0);
 }
@@ -939,8 +959,23 @@ static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
 static int lov_cleanup(struct obd_device *obd)
 {
         struct lov_obd *lov = &obd->u.lov;
+        struct list_head *pos, *tmp;
+        struct pool_desc *pool;
+
+        list_for_each_safe(pos, tmp, &lov->lov_pool_list) {
+                pool = list_entry(pos, struct pool_desc, pool_list);
+                list_del(&pool->pool_list);
+                lustre_hash_del_key(lov->lov_pools_hash_body, pool->pool_name);
+                lov_ost_pool_free(&(pool->pool_rr.lqr_pool));
+                lov_ost_pool_free(&(pool->pool_obds));
+                OBD_FREE(pool, sizeof(*pool));
+        }
+        lustre_hash_exit(lov->lov_pools_hash_body);
 
         lprocfs_obd_cleanup(obd);
+
+        lov_ost_pool_free(&lov->lov_packed);
+
         if (lov->lov_tgts) {
                 int i;
                 for (i = 0; i < lov->desc.ld_tgt_count; i++) {
@@ -964,8 +999,7 @@ static int lov_cleanup(struct obd_device *obd)
                 lov->lov_tgt_size = 0;
         }
 
-        if (lov->lov_qos.lq_rr_size)
-                OBD_FREE(lov->lov_qos.lq_rr_array, lov->lov_qos.lq_rr_size);
+        lov_ost_pool_free(&(lov->lov_qos.lq_rr.lqr_pool));
 
         RETURN(0);
 }
@@ -1015,6 +1049,12 @@ static int lov_process_config(struct obd_device *obd, obd_count len, void *buf)
                                               lcfg, obd);
                 GOTO(out, rc);
         }
+        case LCFG_POOL_NEW:
+        case LCFG_POOL_ADD:
+        case LCFG_POOL_DEL:
+        case LCFG_POOL_REM:
+                GOTO(out, rc);
+
         default: {
                 CERROR("Unknown command: %d\n", lcfg->lcfg_command);
                 GOTO(out, rc = -EINVAL);
@@ -1193,7 +1233,8 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa,
 #define ASSERT_LSM_MAGIC(lsmp)                                                  \
 do {                                                                            \
         LASSERT((lsmp) != NULL);                                                \
-        LASSERTF(((lsmp)->lsm_magic == LOV_MAGIC ||                             \
+        LASSERTF(((lsmp)->lsm_magic == LOV_MAGIC_V1 ||                          \
+                 (lsmp)->lsm_magic == LOV_MAGIC_V3 ||                           \
                  (lsmp)->lsm_magic == LOV_MAGIC_JOIN), "%p->lsm_magic=%x\n",    \
                  (lsmp), (lsmp)->lsm_magic);                                    \
 } while (0)
@@ -3332,6 +3373,10 @@ struct obd_ops lov_obd_ops = {
         .o_unregister_page_removal_cb = lov_unregister_page_removal_cb,
         .o_register_lock_cancel_cb = lov_register_lock_cancel_cb,
         .o_unregister_lock_cancel_cb = lov_unregister_lock_cancel_cb,
+        .o_pool_new            = lov_pool_new,
+        .o_pool_rem            = lov_pool_remove,
+        .o_pool_add            = lov_pool_add,
+        .o_pool_del            = lov_pool_del,
 };
 
 static quota_interface_t *quota_interface;
index 8b2003d..b02c980 100644 (file)
@@ -94,6 +94,52 @@ void lov_dump_lmm_join(int level, struct lov_mds_md_join *lmmj)
                le32_to_cpu(lmmj->lmmj_extent_count));
 }
 
+/*
+ * Log a LOV_MAGIC_V3 EA through CDEBUG() at the given debug level.
+ *
+ * Header fields are decoded with le*_to_cpu() before printing.  The
+ * per-stripe entries are printed only when the stripe count does not
+ * exceed LOV_V1_INSANE_STRIPE_COUNT; otherwise a single "bad
+ * stripe_count" line is logged instead.
+ */
+void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm)
+{
+        __u32 stripe_count = le32_to_cpu(lmm->lmm_stripe_count);
+        struct lov_ost_data_v1 *lod = lmm->lmm_objects;
+        int i;
+
+        CDEBUG(level, "objid "LPX64", magic 0x%08x, pattern %#x\n",
+               le64_to_cpu(lmm->lmm_object_id), le32_to_cpu(lmm->lmm_magic),
+               le32_to_cpu(lmm->lmm_pattern));
+        CDEBUG(level,"stripe_size %u, stripe_count %u\n",
+               le32_to_cpu(lmm->lmm_stripe_size), stripe_count);
+        CDEBUG(level,"pool_name "POOLNAMEF"\n", lmm->lmm_pool_name);
+
+        /* refuse to walk the object array on an implausible count */
+        if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) {
+                CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n",
+                       stripe_count, LOV_V1_INSANE_STRIPE_COUNT);
+                return;
+        }
+
+        for (i = 0; i < (int)stripe_count; i++, lod++) {
+                CDEBUG(level,
+                       "stripe %u idx %u subobj "LPX64"/"LPX64"\n",
+                       i, le32_to_cpu(lod->l_ost_idx),
+                       le64_to_cpu(lod->l_object_gr),
+                       le64_to_cpu(lod->l_object_id));
+        }
+}
+
+/*
+ * Dump a LOV EA of any known format at the given debug level.
+ *
+ * The format-specific dump helpers decode every field, including the
+ * magic, with le*_to_cpu(), so the magic is converted to host byte order
+ * here as well before it is compared with the host-order LOV_MAGIC_*
+ * constants.  An unrecognized magic is reported with CERROR() and
+ * nothing else is dumped.
+ */
+void lov_dump_lmm(int level, void *lmm)
+{
+        int magic;
+
+        magic = le32_to_cpu(((struct lov_mds_md_v1 *)(lmm))->lmm_magic);
+        switch (magic) {
+        case LOV_MAGIC_V1:
+                lov_dump_lmm_v1(level, (struct lov_mds_md_v1 *)(lmm));
+                break;
+        case LOV_MAGIC_JOIN:
+                lov_dump_lmm_join(level, (struct lov_mds_md_join *)(lmm));
+                break;
+        case LOV_MAGIC_V3:
+                lov_dump_lmm_v3(level, (struct lov_mds_md_v3 *)(lmm));
+                break;
+        default:
+                CERROR("Cannot recognize lmm_magic %x\n", magic);
+                break;
+        }
+}
+
 #define LMM_ASSERT(test)                                                \
 do {                                                                    \
         if (!(test)) lov_dump_lmm(D_ERROR, lmm);                        \
@@ -113,37 +159,51 @@ int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
 {
         struct obd_device *obd = class_exp2obd(exp);
         struct lov_obd *lov = &obd->u.lov;
-        struct lov_mds_md *lmm;
+        struct lov_mds_md_v1 *lmmv1;
+        struct lov_mds_md_v3 *lmmv3;
         int stripe_count = lov->desc.ld_tgt_count;
-        int lmm_size;
+        struct lov_ost_data_v1 *lmm_objects;
+        int lmm_size, lmm_magic;
         int i;
         ENTRY;
 
         if (lsm) {
-                if (lsm->lsm_magic != LOV_MAGIC) {
-                        CERROR("bad mem LOV MAGIC: 0x%08X != 0x%08X\n",
-                               lsm->lsm_magic, LOV_MAGIC);
-                        RETURN(-EINVAL);
-                }
+                lmm_magic = lsm->lsm_magic;
+
                 /* If we are just sizing the EA, limit the stripe count
                  * to the actual number of OSTs in this filesystem. */
                 if (!lmmp) {
-                        stripe_count = lov_get_stripecnt(lov, lsm->lsm_stripe_count);
+                        stripe_count = lov_get_stripecnt(lov,
+                                                         lsm->lsm_stripe_count);
                         lsm->lsm_stripe_count = stripe_count;
                 } else {
                         stripe_count = lsm->lsm_stripe_count;
                 }
+        } else if (lmmp && *lmmp) {
+                lmm_magic = le32_to_cpu((*lmmp)->lmm_magic);
+        } else {
+                /* lsm == NULL and lmmp == NULL */
+                lmm_magic = LOV_MAGIC;
+        }
+
+        if ((lmm_magic != LOV_MAGIC_V1) &&
+            (lmm_magic != LOV_MAGIC_V3)) {
+                CERROR("bad mem LOV MAGIC: 0x%08X != 0x%08X nor 0x%08X\n",
+                        lmm_magic, LOV_MAGIC_V1, LOV_MAGIC_V3);
+                RETURN(-EINVAL);
+
         }
 
         /* XXX LOV STACKING call into osc for sizes */
-        lmm_size = lov_mds_md_size(stripe_count);
+        lmm_size = lov_mds_md_size(stripe_count, lmm_magic);
 
         if (!lmmp)
                 RETURN(lmm_size);
 
         if (*lmmp && !lsm) {
                 stripe_count = le32_to_cpu((*lmmp)->lmm_stripe_count);
-                OBD_FREE(*lmmp, lov_mds_md_size(stripe_count));
+                lmm_size = lov_mds_md_size(stripe_count, lmm_magic);
+                OBD_FREE(*lmmp, lmm_size);
                 *lmmp = NULL;
                 RETURN(0);
         }
@@ -154,28 +214,44 @@ int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp,
                         RETURN(-ENOMEM);
         }
 
-        lmm = *lmmp;
-        lmm->lmm_magic = cpu_to_le32(LOV_MAGIC); /* only write new format */
+        CDEBUG(D_INFO, "lov_packmd: LOV_MAGIC 0x%08X, lmm_size = %d \n",
+               lmm_magic, lmm_size);
+
+        lmmv1 = *lmmp;
+        lmmv3 = (struct lov_mds_md_v3 *)*lmmp;
+        if (lmm_magic == LOV_MAGIC_V3)
+                lmmv3->lmm_magic = cpu_to_le32(LOV_MAGIC_V3);
+        else
+                lmmv1->lmm_magic = cpu_to_le32(LOV_MAGIC_V1);
 
         if (!lsm)
                 RETURN(lmm_size);
 
-        lmm->lmm_object_id = cpu_to_le64(lsm->lsm_object_id);
-        lmm->lmm_object_gr = cpu_to_le64(lsm->lsm_object_gr);
-        lmm->lmm_stripe_size = cpu_to_le32(lsm->lsm_stripe_size);
-        lmm->lmm_stripe_count = cpu_to_le32(stripe_count);
-        lmm->lmm_pattern = cpu_to_le32(lsm->lsm_pattern);
+        /* lmmv1 and lmmv3 point to the same struct and have the
+         * same first fields
+         */
+        lmmv1->lmm_object_id = cpu_to_le64(lsm->lsm_object_id);
+        lmmv1->lmm_object_gr = cpu_to_le64(lsm->lsm_object_gr);
+        lmmv1->lmm_stripe_size = cpu_to_le32(lsm->lsm_stripe_size);
+        lmmv1->lmm_stripe_count = cpu_to_le32(stripe_count);
+        lmmv1->lmm_pattern = cpu_to_le32(lsm->lsm_pattern);
+        if (lsm->lsm_magic == LOV_MAGIC_V3) {
+                strncpy(lmmv3->lmm_pool_name, lsm->lsm_pool_name, MAXPOOLNAME);
+                lmm_objects = lmmv3->lmm_objects;
+        } else {
+                lmm_objects = lmmv1->lmm_objects;
+        }
 
         for (i = 0; i < stripe_count; i++) {
                 struct lov_oinfo *loi = lsm->lsm_oinfo[i];
 
                 /* XXX LOV STACKING call down to osc_packmd() to do packing */
                 LASSERTF(loi->loi_id, "lmm_oid "LPU64" stripe %u/%u idx %u\n",
-                         lmm->lmm_object_id, i, stripe_count, loi->loi_ost_idx);
-                lmm->lmm_objects[i].l_object_id = cpu_to_le64(loi->loi_id);
-                lmm->lmm_objects[i].l_object_gr = cpu_to_le64(loi->loi_gr);
-                lmm->lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen);
-                lmm->lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx);
+                         lmmv1->lmm_object_id, i, stripe_count, loi->loi_ost_idx);
+                lmm_objects[i].l_object_id = cpu_to_le64(loi->loi_id);
+                lmm_objects[i].l_object_gr = cpu_to_le64(loi->loi_gr);
+                lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen);
+                lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx);
         }
 
         RETURN(lmm_size);
@@ -205,9 +281,22 @@ static int lov_verify_lmm(void *lmm, int lmm_bytes, int *stripe_count)
         int rc;
 
         if (lsm_op_find(le32_to_cpu(*(__u32 *)lmm)) == NULL) {
-                CERROR("bad disk LOV MAGIC: 0x%08X; dumping V1 LMM:\n",
-                       le32_to_cpu(*(__u32 *)lmm));
-                lov_dump_lmm_v1(D_WARNING, lmm);
+                char *buffer;
+                int sz;
+
+                CERROR("bad disk LOV MAGIC: 0x%08X; dumping LMM (size=%d):\n",
+                       le32_to_cpu(*(__u32 *)lmm), lmm_bytes);
+                sz = lmm_bytes * 2 + 1;
+                OBD_ALLOC(buffer, sz);
+                if (buffer != NULL) {
+                        int i;
+
+                        for (i = 0; i < lmm_bytes; i++)
+                                sprintf(buffer+2*i, "%.2X", ((char *)lmm)[i]);
+                        buffer[sz] = '\0';
+                        CERROR("%s\n", buffer);
+                        OBD_FREE(buffer, sz);
+                }
                 return -EINVAL;
         }
         rc = lsm_op_find(le32_to_cpu(*(__u32 *)lmm))->lsm_lmm_verify(lmm,
@@ -234,6 +323,7 @@ int lov_alloc_memmd(struct lov_stripe_md **lsmp, int stripe_count,
         (*lsmp)->lsm_stripe_count = stripe_count;
         (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count;
         (*lsmp)->lsm_pattern = pattern;
+        (*lsmp)->lsm_pool_name[0] = '\0';
         (*lsmp)->lsm_oinfo[0]->loi_ost_idx = ~0;
 
         for (i = 0; i < stripe_count; i++)
@@ -312,68 +402,108 @@ static int __lov_setstripe(struct obd_export *exp, struct lov_stripe_md **lsmp,
 {
         struct obd_device *obd = class_exp2obd(exp);
         struct lov_obd *lov = &obd->u.lov;
-        struct lov_user_md lum;
+        struct lov_user_md_v3 lumv3;
+        struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
+        int lmm_magic;
         int stripe_count;
         int rc;
         ENTRY;
 
-        rc = copy_from_user(&lum, lump, sizeof(lum));
+        rc = copy_from_user(&lumv3, lump, sizeof(struct lov_user_md_v1));
         if (rc)
                 RETURN(-EFAULT);
 
-        if (lum.lmm_magic != LOV_USER_MAGIC) {
-                if (lum.lmm_magic == __swab32(LOV_USER_MAGIC)) {
-                        lustre_swab_lov_user_md(&lum);
-                } else {
-                        CDEBUG(D_IOCTL, "bad userland LOV MAGIC:"
-                               " %#08x != %#08x\n",
-                               lum.lmm_magic, LOV_USER_MAGIC);
-                        RETURN(-EINVAL);
-                }
+        lmm_magic = lumv1->lmm_magic;
+
+        if (lmm_magic == __swab32(LOV_USER_MAGIC_V1)) {
+                lustre_swab_lov_user_md_v1(lumv1);
+                lmm_magic = LOV_USER_MAGIC_V1;
+        } else if (lmm_magic == LOV_USER_MAGIC_V3) {
+                rc = copy_from_user(&lumv3, lump, sizeof(lumv3));
+                if (rc)
+                        RETURN(-EFAULT);
+        } else if (lmm_magic == __swab32(LOV_USER_MAGIC_V3)) {
+                rc = copy_from_user(&lumv3, lump, sizeof(lumv3));
+                if (rc)
+                        RETURN(-EFAULT);
+                lustre_swab_lov_user_md_v3(&lumv3);
+                lmm_magic = LOV_USER_MAGIC_V3;
+        } else if (lmm_magic != LOV_USER_MAGIC_V1) {
+                CDEBUG(D_IOCTL,
+                       "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n",
+                       lmm_magic, LOV_USER_MAGIC_V1, LOV_USER_MAGIC_V3);
+                       RETURN(-EINVAL);
         }
 
-        if (lum.lmm_pattern == 0) {
-                lum.lmm_pattern = lov->desc.ld_pattern ?
+        /* in the rest of the tests, as *lumv1 and lumv3 have the same
+         * fields, we use lumv1 to avoid code duplication */
+
+        if (lumv1->lmm_pattern == 0) {
+                lumv1->lmm_pattern = lov->desc.ld_pattern ?
                         lov->desc.ld_pattern : LOV_PATTERN_RAID0;
         }
 
-        if (lum.lmm_pattern != LOV_PATTERN_RAID0) {
+        if (lumv1->lmm_pattern != LOV_PATTERN_RAID0) {
                 CDEBUG(D_IOCTL, "bad userland stripe pattern: %#x\n",
-                       lum.lmm_pattern);
+                       lumv1->lmm_pattern);
                 RETURN(-EINVAL);
         }
 
         /* 64kB is the largest common page size we see (ia64), and matches the
          * check in lfs */
-        if (lum.lmm_stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) {
+        if (lumv1->lmm_stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) {
                 CDEBUG(D_IOCTL, "stripe size %u not multiple of %u, fixing\n",
-                       lum.lmm_stripe_size, LOV_MIN_STRIPE_SIZE);
-                lum.lmm_stripe_size = LOV_MIN_STRIPE_SIZE;
+                       lumv1->lmm_stripe_size, LOV_MIN_STRIPE_SIZE);
+                lumv1->lmm_stripe_size = LOV_MIN_STRIPE_SIZE;
         }
 
-        if ((lum.lmm_stripe_offset >= lov->desc.ld_tgt_count) &&
-            (lum.lmm_stripe_offset != (typeof(lum.lmm_stripe_offset))(-1))) {
+        if ((lumv1->lmm_stripe_offset >= lov->desc.ld_tgt_count) &&
+            (lumv1->lmm_stripe_offset !=
+             (typeof(lumv1->lmm_stripe_offset))(-1))) {
                 CDEBUG(D_IOCTL, "stripe offset %u > number of OSTs %u\n",
-                       lum.lmm_stripe_offset, lov->desc.ld_tgt_count);
+                       lumv1->lmm_stripe_offset, lov->desc.ld_tgt_count);
                 RETURN(-EINVAL);
         }
-        stripe_count = lov_get_stripecnt(lov, lum.lmm_stripe_count);
+        stripe_count = lov_get_stripecnt(lov, lumv1->lmm_stripe_count);
+
+        if (lmm_magic == LOV_USER_MAGIC_V3) {
+                struct pool_desc *pool;
+
+                pool = lov_find_pool(lov, lumv3.lmm_pool_name);
+                if (pool == NULL)
+                        RETURN(-EINVAL);
+
+                if (lumv3.lmm_stripe_offset !=
+                    (typeof(lumv3.lmm_stripe_offset))(-1)) {
+                        rc = lov_check_index_in_pool(lumv3.lmm_stripe_offset,
+                                                     pool);
+                        if (rc < 0)
+                                RETURN(-EINVAL);
+                }
+
+                if (stripe_count > pool_tgt_count(pool))
+                        stripe_count = pool_tgt_count(pool);
+        }
 
-        if ((__u64)lum.lmm_stripe_size * stripe_count > ~0UL) {
+        if ((__u64)lumv1->lmm_stripe_size * stripe_count > ~0UL) {
                 CDEBUG(D_IOCTL, "stripe width %ux%i exeeds %lu bytes\n",
-                       lum.lmm_stripe_size, (int)lum.lmm_stripe_count, ~0UL);
+                       lumv1->lmm_stripe_size, (int)lumv1->lmm_stripe_count,
+                       ~0UL);
                 RETURN(-EINVAL);
         }
 
-        rc = lov_alloc_memmd(lsmp, stripe_count, lum.lmm_pattern, LOV_MAGIC);
+        rc = lov_alloc_memmd(lsmp, stripe_count, lumv1->lmm_pattern, lmm_magic);
 
         if (rc >= 0) {
-                (*lsmp)->lsm_oinfo[0]->loi_ost_idx = lum.lmm_stripe_offset;
-                (*lsmp)->lsm_stripe_size = lum.lmm_stripe_size;
+                (*lsmp)->lsm_oinfo[0]->loi_ost_idx = lumv1->lmm_stripe_offset;
+                (*lsmp)->lsm_stripe_size = lumv1->lmm_stripe_size;
+                if (lmm_magic == LOV_USER_MAGIC_V3)
+                        strncpy((*lsmp)->lsm_pool_name, lumv3.lmm_pool_name,
+                                MAXPOOLNAME);
                 rc = 0;
         }
 
-        RETURN(0);
+        RETURN(rc);
 }
 
 /* Configure object striping information on a new file.
@@ -405,20 +535,27 @@ int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp,
         struct obd_export *oexp;
         struct lov_obd *lov = &exp->exp_obd->u.lov;
         obd_id last_id = 0;
+        struct lov_user_ost_data_v1 *lmm_objects;
 
         ENTRY;
+
+        if (lump->lmm_magic == LOV_USER_MAGIC_V3)
+                lmm_objects = ((struct lov_user_md_v3 *)lump)->lmm_objects;
+        else
+                lmm_objects = lump->lmm_objects;
+
         for (i = 0; i < lump->lmm_stripe_count; i++) {
                 __u32 len = sizeof(last_id);
-                oexp = lov->lov_tgts[lump->lmm_objects[i].l_ost_idx]->ltd_exp;
+                oexp = lov->lov_tgts[lmm_objects[i].l_ost_idx]->ltd_exp;
                 rc = obd_get_info(oexp, sizeof(KEY_LAST_ID), KEY_LAST_ID,
                                   &len, &last_id, NULL);
                 if (rc)
                         RETURN(rc);
-                if (lump->lmm_objects[i].l_object_id > last_id) {
+                if (lmm_objects[i].l_object_id > last_id) {
                         CERROR("Setting EA for object > than last id on "
                                "ost idx %d "LPD64" > "LPD64" \n",
-                               lump->lmm_objects[i].l_ost_idx,
-                               lump->lmm_objects[i].l_object_id, last_id);
+                               lmm_objects[i].l_ost_idx,
+                               lmm_objects[i].l_object_id, last_id);
                         RETURN(-EINVAL);
                 }
         }
@@ -429,9 +566,9 @@ int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp,
 
         for (i = 0; i < lump->lmm_stripe_count; i++) {
                 (*lsmp)->lsm_oinfo[i]->loi_ost_idx =
-                        lump->lmm_objects[i].l_ost_idx;
-                (*lsmp)->lsm_oinfo[i]->loi_id = lump->lmm_objects[i].l_object_id;
-                (*lsmp)->lsm_oinfo[i]->loi_gr = lump->lmm_objects[i].l_object_gr;
+                        lmm_objects[i].l_ost_idx;
+                (*lsmp)->lsm_oinfo[i]->loi_id = lmm_objects[i].l_object_id;
+                (*lsmp)->lsm_oinfo[i]->loi_gr = lmm_objects[i].l_object_gr;
         }
         RETURN(0);
 }
@@ -449,9 +586,11 @@ int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm,
         /*
          * XXX huge struct allocated on stack.
          */
-        struct lov_user_md lum;
+        /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */
+        struct lov_user_md_v3 lum;
         struct lov_mds_md *lmmk = NULL;
         int rc, lmm_size;
+        int lum_size;
         mm_segment_t seg;
         ENTRY;
 
@@ -464,12 +603,22 @@ int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm,
          */
         seg = get_fs();
         set_fs(KERNEL_DS);
-        rc = copy_from_user(&lum, lump, sizeof(lum));
+
+        /* we only need the header part from user space to get lmm_magic and
+         * lmm_stripe_count, (the header part is common to v1 and v3) */
+        lum_size = sizeof(struct lov_user_md_v1);
+        rc = copy_from_user(&lum, lump, lum_size);
+
         if (rc)
                 rc = -EFAULT;
-        else if (lum.lmm_magic != LOV_USER_MAGIC)
+        else if ((lum.lmm_magic != LOV_USER_MAGIC) &&
+                 (lum.lmm_magic != LOV_USER_MAGIC_V3))
                 rc = -EINVAL;
         else {
+                /* if v3 we just have to update the lum_size */
+                if (lum.lmm_magic == LOV_USER_MAGIC_V3)
+                        lum_size = sizeof(struct lov_user_md_v3);
+
                 rc = lov_packmd(exp, &lmmk, lsm);
                 if (rc < 0)
                         RETURN(rc);
@@ -477,17 +626,18 @@ int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm,
                 rc = 0;
 
                 /* FIXME: Bug 1185 - copy fields properly when structs change */
-                CLASSERT(sizeof lum == sizeof *lmmk);
+                /* struct lov_user_md_v3 and struct lov_mds_md_v3 must be the same */
+                CLASSERT(sizeof(lum) == sizeof(struct lov_mds_md_v3));
                 CLASSERT(sizeof lum.lmm_objects[0] ==
                          sizeof lmmk->lmm_objects[0]);
 
                 /* User wasn't expecting this many OST entries */
                 if (lum.lmm_stripe_count == 0) {
-                        if (copy_to_user(lump, lmmk, sizeof lum))
+                        if (copy_to_user(lump, lmmk, lum_size))
                                 rc = -EFAULT;
                 } else if (lum.lmm_stripe_count < lmmk->lmm_stripe_count) {
                         rc = -EOVERFLOW;
-                } else if (copy_to_user(lump, lmmk, sizeof lum))
+                } else if (copy_to_user(lump, lmmk, lmm_size))
                         rc = -EFAULT;
 
                 obd_free_diskmd(exp, &lmmk);
diff --git a/lustre/lov/lov_pool.c b/lustre/lov/lov_pool.c
new file mode 100644 (file)
index 0000000..05fde47
--- /dev/null
@@ -0,0 +1,619 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see [sun.com URL with a
+ * copy of GPLv2].
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/lov/lov_pool.c
+ *
+ * OST pool methods
+ *
+ * Author: Jacques-Charles LAFOUCRIERE <jc.lafoucriere@cea.fr>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#ifdef __KERNEL__
+#include <libcfs/libcfs.h>
+#else
+#include <liblustre.h>
+#endif
+
+#include <obd.h>
+#include "lov_internal.h"
+
+/*
+ * hash function using a Rotating Hash algorithm
+ * Knuth, D. The Art of Computer Programming,
+ * Volume 3: Sorting and Searching,
+ * Chapter 6.4.
+ * Addison Wesley, 1973
+ */
+static __u32 pool_hashfn(lustre_hash_t *hash_body, void *key, unsigned mask)
+{
+        const char *name = (const char *)key;
+        __u32 hash = 0;
+        int pos;
+
+        /* fold in at most MAXPOOLNAME characters, stopping at the NUL */
+        for (pos = 0; pos < MAXPOOLNAME && name[pos] != '\0'; pos++)
+                hash = (hash << 4) ^ (hash >> 28) ^ name[pos];
+
+        return hash % mask;
+}
+
+static void *pool_key(struct hlist_node *hnode)
+{
+        /* the key of a hash entry is the name held in its pool descriptor */
+        return hlist_entry(hnode, struct pool_desc, pool_hash)->pool_name;
+}
+
+static int pool_hashkey_compare(void *key, struct hlist_node *compared_hnode)
+{
+        struct pool_desc *candidate;
+
+        candidate = hlist_entry(compared_hnode, struct pool_desc, pool_hash);
+
+        /* returns non-zero when the names match over MAXPOOLNAME chars */
+        return strncmp((char *)key, candidate->pool_name, MAXPOOLNAME) == 0;
+}
+
+static void *pool_hashrefcount_get(struct hlist_node *hnode)
+{
+        /* no counter is touched here: the enclosing descriptor is simply
+         * handed back to the hash code */
+        return hlist_entry(hnode, struct pool_desc, pool_hash);
+}
+
+static void *pool_hashrefcount_put(struct hlist_node *hnode)
+{
+        /* mirror of pool_hashrefcount_get(): nothing is released, the
+         * enclosing descriptor is simply handed back */
+        return hlist_entry(hnode, struct pool_desc, pool_hash);
+}
+
+/* callbacks used for the hash of pools, keyed by pool name */
+lustre_hash_ops_t pool_hash_operations = {
+        .lh_hash    = pool_hashfn,           /* name -> bucket */
+        .lh_key     = pool_key,              /* entry -> name */
+        .lh_compare = pool_hashkey_compare,  /* name vs. entry */
+        .lh_get     = pool_hashrefcount_get,
+        .lh_put     = pool_hashrefcount_put,
+};
+
+#ifdef LPROCFS
+/* ifdef needed for liblustre support */
+/*
+ * pool /proc seq_file methods
+ */
+/*
+ * iterator is used to go through the target pool entries
+ * index is the current entry index in the lp_array[] array
+ * index >= pos returned to the seq_file interface
+ * pos is from 0 to (pool->pool_obds.op_count - 1)
+ */
+#define POOL_IT_MAGIC 0xB001CEA0
+struct pool_iterator {
+        /* set to POOL_IT_MAGIC; lets pool_proc_stop() tell an iterator
+         * apart from the pool descriptor that s->private otherwise holds */
+        int magic;
+        /* pool whose members are being listed */
+        struct pool_desc *pool;
+        /* current entry, from 0 to pool_tgt_size - 1 */
+        int idx;
+};
+
+static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos)
+{
+        struct pool_iterator *it = (struct pool_iterator *)s->private;
+        void *result = NULL;
+        int saved_idx;
+
+        LASSERTF(it->magic == POOL_IT_MAGIC, "%08X", it->magic);
+
+        /* already at or past the last entry: end of file */
+        if (*pos >= pool_tgt_count(it->pool))
+                return NULL;
+
+        /* step to the following entry, unless there is none */
+        saved_idx = it->idx;
+        read_lock(&pool_tgt_rwlock(it->pool));
+        it->idx++;
+        if (it->idx == pool_tgt_count(it->pool))
+                it->idx = saved_idx;    /* we stay on the last entry */
+        else
+                result = it;            /* != NULL means "continue" */
+        read_unlock(&pool_tgt_rwlock(it->pool));
+
+        if (result != NULL)
+                (*pos)++;
+        return result;
+}
+
+static void *pool_proc_start(struct seq_file *s, loff_t *pos)
+{
+        struct pool_desc *pool = (struct pool_desc *)s->private;
+        struct pool_iterator *iter;
+        loff_t cur;
+        void *ptr;
+
+        /* nothing to show for an empty pool or a position past the end */
+        if (pool_tgt_count(pool) == 0)
+                return NULL;
+        if (*pos >= pool_tgt_count(pool))
+                return NULL;
+
+        OBD_ALLOC(iter, sizeof(struct pool_iterator));
+        if (!iter)
+                return ERR_PTR(-ENOMEM);
+        iter->magic = POOL_IT_MAGIC;
+        iter->pool = pool;
+        iter->idx = 0;
+
+        /* the seq_file private field is borrowed to remember the iterator
+         * so that stop() can free it; stop() must put the pool pointer
+         * back into s->private before freeing the iterator */
+        s->private = iter;
+        if (*pos <= 0)
+                return iter;
+
+        /* walk forward from the first entry up to the requested position */
+        cur = 0;
+        do {
+                ptr = pool_proc_next(s, &iter, &cur);
+        } while ((cur < *pos) && (ptr != NULL));
+        return ptr;
+}
+
+static void pool_proc_stop(struct seq_file *s, void *v)
+{
+        struct pool_iterator *iter = (struct pool_iterator *)s->private;
+
+        /* stop() is sometimes called twice without an intervening start()
+         * (see seq_read() in fs/seq_file.c), so only act when s->private
+         * really is an iterator */
+        if (iter == NULL || iter->magic != POOL_IT_MAGIC)
+                return;
+
+        /* put the pool back in s->private so that the next call to
+         * pool_proc_start() works, then release the iterator */
+        s->private = iter->pool;
+        OBD_FREE(iter, sizeof(struct pool_iterator));
+}
+
+static int pool_proc_show(struct seq_file *s, void *v)
+{
+        struct pool_iterator *it = (struct pool_iterator *)v;
+        struct lov_tgt_desc *target;
+
+        LASSERTF(it->magic == POOL_IT_MAGIC, "%08X", it->magic);
+        LASSERT(it->pool != NULL);
+        LASSERT(it->idx <= pool_tgt_count(it->pool));
+
+        /* fetch the target the iterator currently points at */
+        read_lock(&pool_tgt_rwlock(it->pool));
+        target = pool_tgt(it->pool, it->idx);
+        read_unlock(&pool_tgt_rwlock(it->pool));
+
+        /* an empty slot prints nothing */
+        if (!target)
+                return 0;
+
+        seq_printf(s, "%s\n", obd_uuid2str(&(target->ltd_uuid)));
+        return 0;
+}
+
+/* seq_file iteration callbacks listing the members of one pool */
+static struct seq_operations pool_proc_ops = {
+        .start = pool_proc_start,
+        .next  = pool_proc_next,
+        .stop  = pool_proc_stop,
+        .show  = pool_proc_show,
+};
+
+static int pool_proc_open(struct inode *inode, struct file *file)
+{
+        struct seq_file *seq;
+        int rc;
+
+        rc = seq_open(file, &pool_proc_ops);
+        if (rc)
+                return rc;
+
+        /* hand the data pointer of the proc entry over to the seq_file */
+        seq = file->private_data;
+        seq->private = PROC_I(inode)->pde->data;
+        return 0;
+}
+
+/* file operations of a pool /proc entry; only open is pool specific,
+ * the rest is the stock seq_file implementation */
+static struct file_operations pool_proc_operations = {
+        .open    = pool_proc_open,
+        .read    = seq_read,
+        .llseek  = seq_lseek,
+        .release = seq_release,
+};
+#endif /* LPROCFS */
+
+/* Log, at debug level \a level, the member count of \a pool followed by
+ * one line per member that has an export. */
+void lov_dump_pool(int level, struct pool_desc *pool)
+{
+        struct lov_tgt_desc *tgt;
+        int slot;
+
+        CDEBUG(level, "pool "POOLNAMEF" has %d members\n",
+               pool->pool_name, pool->pool_obds.op_count);
+
+        read_lock(&pool_tgt_rwlock(pool));
+        for (slot = 0; slot < pool_tgt_count(pool); slot++) {
+                tgt = pool_tgt(pool, slot);
+                /* skip empty slots and targets without an export */
+                if (!tgt || !tgt->ltd_exp)
+                        continue;
+                CDEBUG(level, "pool "POOLNAMEF"[%d] = %s\n", pool->pool_name,
+                       slot, obd_uuid2str(&(tgt->ltd_uuid)));
+        }
+        read_unlock(&pool_tgt_rwlock(pool));
+}
+
+#define LOV_POOL_INIT_COUNT 2
+/* Set up an empty ost_pool with room for \a count indices
+ * (LOV_POOL_INIT_COUNT when \a count is 0).
+ * Returns 0, or -ENOMEM when the index array cannot be allocated. */
+int lov_ost_pool_init(struct ost_pool *op, unsigned int count)
+{
+        op->op_array = NULL;
+        op->op_count = 0;
+        op->op_rwlock = RW_LOCK_UNLOCKED;
+        op->op_size = (count != 0) ? count : LOV_POOL_INIT_COUNT;
+
+        OBD_ALLOC(op->op_array, op->op_size * sizeof(op->op_array[0]));
+        if (op->op_array != NULL)
+                return 0;
+
+        /* leave the pool marked as holding no storage */
+        op->op_size = 0;
+        return -ENOMEM;
+}
+
+/* Make room for one more index in \a op when its array is full.
+ * The array is doubled, but never grown beyond \a max_count entries.
+ * Returns 0 when there is already a free slot, when the array was grown,
+ * or when it cannot grow any further; -ENOMEM on allocation failure. */
+int lov_ost_pool_extend(struct ost_pool *op, unsigned int max_count)
+{
+        __u32 *new;
+        int new_size;
+
+        LASSERT(max_count != 0);
+
+        if (op->op_count < op->op_size)
+                return 0;
+
+        new_size = min(max_count, 2 * op->op_size);
+        /* never "grow" into an array that is not larger: copying op_size
+         * entries into a smaller array would overflow it, and an equal
+         * sized one gains nothing */
+        if (new_size <= op->op_size)
+                return 0;
+
+        OBD_ALLOC(new, new_size * sizeof(op->op_array[0]));
+        if (new == NULL)
+                return -ENOMEM;
+
+        /* copy and swap under the write lock so that the copy cannot miss
+         * a change made to the old array in between */
+        write_lock(&op->op_rwlock);
+        memcpy(new, op->op_array, op->op_size * sizeof(op->op_array[0]));
+        OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0]));
+        op->op_array = new;
+        op->op_size = new_size;
+        write_unlock(&op->op_rwlock);
+        return 0;
+}
+
+/* Append OST index \a idx to \a op, growing the array (up to
+ * \a max_count entries) when it is full.
+ * Returns 0 on success, -EEXIST when \a idx is already a member,
+ * -ENOSPC when the array is full and could not be grown, or the error
+ * returned by lov_ost_pool_extend(). */
+int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int max_count)
+{
+        int rc, i;
+
+        rc = lov_ost_pool_extend(op, max_count);
+        if (rc)
+                return rc;
+
+        /* the array and its count are modified below, so the lock must be
+         * held for writing, not reading */
+        write_lock(&op->op_rwlock);
+
+        /* search ost in pool array */
+        for (i = 0; i < op->op_count; i++) {
+                if (op->op_array[i] == idx) {
+                        rc = -EEXIST;
+                        goto out;
+                }
+        }
+
+        /* lov_ost_pool_extend() returns 0 even when the array is already
+         * at its maximum size; never write past its end */
+        if (op->op_count >= op->op_size) {
+                rc = -ENOSPC;
+                goto out;
+        }
+
+        /* ost not found we add it */
+        op->op_array[op->op_count] = idx;
+        op->op_count++;
+out:
+        write_unlock(&op->op_rwlock);
+        return rc;
+}
+
+/* Remove OST index \a idx from \a op, closing the gap it leaves.
+ * Returns 0 on success, -EINVAL when \a idx is not a member. */
+int lov_ost_pool_remove(struct ost_pool *op, __u32 idx)
+{
+        int rc = -EINVAL;
+        int i;
+
+        /* entries are shifted and the count is changed below, so the lock
+         * must be held for writing, not reading */
+        write_lock(&op->op_rwlock);
+        for (i = 0; i < op->op_count; i++) {
+                if (op->op_array[i] == idx) {
+                        /* shift the tail of the array down by one slot */
+                        memmove(&op->op_array[i], &op->op_array[i + 1],
+                                (op->op_count - i - 1) *
+                                sizeof(op->op_array[0]));
+                        op->op_count--;
+                        rc = 0;
+                        break;
+                }
+        }
+        write_unlock(&op->op_rwlock);
+        return rc;
+}
+
+/* Release the index array of \a op and reset it to the empty state.
+ * Always returns 0. */
+int lov_ost_pool_free(struct ost_pool *op)
+{
+        /* op_size == 0 means there is no array to release */
+        if (op->op_size != 0) {
+                write_lock(&op->op_rwlock);
+                OBD_FREE(op->op_array,
+                         op->op_size * sizeof(op->op_array[0]));
+                op->op_array = NULL;
+                op->op_count = 0;
+                op->op_size = 0;
+                write_unlock(&op->op_rwlock);
+        }
+        return 0;
+}
+
+
+/* Create an empty pool named \a poolname on the LOV device \a obd, add it
+ * to the pool hash and list, and create its /proc entry when LPROCFS is
+ * defined.
+ * Returns 0 on success, -ENAMETOOLONG when the name is longer than
+ * MAXPOOLNAME, -ENOMEM on allocation failure, -EEXIST when a pool of that
+ * name is already in the hash, or the error from lov_ost_pool_init().
+ * Nothing stays allocated on any error path. */
+int lov_pool_new(struct obd_device *obd, char *poolname)
+{
+        struct lov_obd *lov;
+        struct pool_desc *new_pool;
+        int rc;
+
+        lov = &(obd->u.lov);
+
+        /* validate the name first so that there is nothing to unwind */
+        if (strlen(poolname) > MAXPOOLNAME)
+                return -ENAMETOOLONG;
+
+        OBD_ALLOC(new_pool, sizeof(*new_pool));
+        if (new_pool == NULL)
+                return -ENOMEM;
+
+        strncpy(new_pool->pool_name, poolname, MAXPOOLNAME);
+        new_pool->pool_name[MAXPOOLNAME] = '\0';
+        new_pool->pool_lov = lov;
+        rc = lov_ost_pool_init(&new_pool->pool_obds, 0);
+        if (rc)
+                goto out_free_pool;
+
+        memset(&(new_pool->pool_rr), 0, sizeof(struct lov_qos_rr));
+        rc = lov_ost_pool_init(&new_pool->pool_rr.lqr_pool, 0);
+        if (rc)
+                goto out_free_obds;
+
+        spin_lock(&obd->obd_dev_lock);
+        /* check if pool already exists */
+        if (lustre_hash_lookup(lov->lov_pools_hash_body,
+                                poolname) != NULL) {
+                spin_unlock(&obd->obd_dev_lock);
+                rc = -EEXIST;
+                goto out_free_rr;
+        }
+
+        INIT_HLIST_NODE(&new_pool->pool_hash);
+        lustre_hash_add_unique(lov->lov_pools_hash_body, poolname,
+                               &new_pool->pool_hash);
+        list_add_tail(&new_pool->pool_list, &lov->lov_pool_list);
+        lov->lov_pool_count++;
+        spin_unlock(&obd->obd_dev_lock);
+
+        CDEBUG(D_CONFIG, POOLNAMEF" is pool #%d\n",
+               poolname, lov->lov_pool_count);
+
+#ifdef LPROCFS
+        /* ifdef needed for liblustre */
+        new_pool->pool_proc_entry = lprocfs_add_simple(lov->lov_pool_proc_entry,
+                                                       poolname,
+                                                       NULL, NULL,
+                                                       new_pool,
+                                                       &pool_proc_operations);
+#endif
+
+        /* a missing /proc entry is not fatal: the pool stays usable */
+        if (IS_ERR(new_pool->pool_proc_entry)) {
+                CWARN("Cannot add proc pool entry "POOLNAMEF"\n", poolname);
+                new_pool->pool_proc_entry = NULL;
+        }
+
+        return 0;
+
+        /* error unwinding, in reverse order of setup */
+out_free_rr:
+        lov_ost_pool_free(&new_pool->pool_rr.lqr_pool);
+out_free_obds:
+        lov_ost_pool_free(&new_pool->pool_obds);
+out_free_pool:
+        OBD_FREE(new_pool, sizeof(*new_pool));
+        return rc;
+}
+
+/* Take the pool named \a poolname out of the pool hash of \a obd and
+ * remove its /proc entry.
+ * Returns 0 on success, -ENOENT when no such pool is in the hash. */
+int lov_pool_del(struct obd_device *obd, char *poolname)
+{
+        struct lov_obd *lov = &(obd->u.lov);
+        struct pool_desc *pool;
+
+        spin_lock(&obd->obd_dev_lock);
+        pool = lustre_hash_lookup(lov->lov_pools_hash_body, poolname);
+        if (pool == NULL) {
+                spin_unlock(&obd->obd_dev_lock);
+                return -ENOENT;
+        }
+
+#ifdef LPROCFS
+        if (pool->pool_proc_entry != NULL)
+                remove_proc_entry(pool->pool_proc_entry->name,
+                                  pool->pool_proc_entry->parent);
+#endif
+
+        /* only the hash entry goes away: the pool is deliberately kept on
+         * lov_pool_list (no list_del() here) to be freed by lov_cleanup()
+         */
+        lustre_hash_del_key(lov->lov_pools_hash_body, poolname);
+        lov->lov_pool_count--;
+        spin_unlock(&obd->obd_dev_lock);
+
+        /* the pool struct, its target array and its round-robin array are
+         * not freed here because the struct may still be used by some
+         * open in /proc; they are freed at lov_cleanup()
+         */
+        return 0;
+}
+
+
+/* Add the OST whose uuid is \a ostname to the pool named \a poolname.
+ * Returns 0 on success, -ENOENT when the pool is not in the hash, -EINVAL
+ * when the OST is not a target of this LOV, or the error returned by
+ * lov_ost_pool_extend() / lov_ost_pool_add(). */
+int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname)
+{
+        struct lov_obd *lov = &(obd->u.lov);
+        struct obd_uuid ost_uuid;
+        struct pool_desc *pool;
+        unsigned int lov_idx;
+        int found;
+        int rc;
+
+        pool = lustre_hash_lookup(lov->lov_pools_hash_body, poolname);
+        if (pool == NULL)
+                return -ENOENT;
+
+        /* allocate pool tgt array if needed */
+        mutex_down(&lov->lov_lock);
+        rc = lov_ost_pool_extend(&pool->pool_obds, lov->lov_tgt_size);
+        mutex_up(&lov->lov_lock);
+        if (rc)
+                return rc;
+
+        obd_str2uuid(&ost_uuid, ostname);
+
+        /* search ost in lov array; the loop stops on the matching index */
+        spin_lock(&obd->obd_dev_lock);
+        for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) {
+                if (lov->lov_tgts[lov_idx] &&
+                    obd_uuid_equals(&ost_uuid,
+                                    &(lov->lov_tgts[lov_idx]->ltd_uuid)))
+                        break;
+        }
+        /* running off the end means the ost is not in this lov */
+        found = (lov_idx != lov->desc.ld_tgt_count);
+        spin_unlock(&obd->obd_dev_lock);
+        if (!found)
+                return -EINVAL;
+
+        rc = lov_ost_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size);
+        if (rc)
+                return rc;
+
+        /* flag the pool's round-robin state as out of date */
+        pool->pool_rr.lqr_dirty = 1;
+
+        CDEBUG(D_CONFIG, "Added %s to "POOLNAMEF" as member %d\n",
+               ostname, poolname,  pool_tgt_count(pool));
+        return 0;
+}
+
+/* Remove the OST whose uuid is \a ostname from the pool named \a poolname.
+ * Returns 0 on success, -ENOENT when the pool is not in the hash, -EINVAL
+ * when the OST is not a target of this LOV.  The result of the removal
+ * from the pool array itself is not reported. */
+int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname)
+{
+        struct lov_obd *lov = &(obd->u.lov);
+        struct obd_uuid ost_uuid;
+        struct pool_desc *pool;
+        unsigned int lov_idx;
+        int found;
+
+        spin_lock(&obd->obd_dev_lock);
+        pool = lustre_hash_lookup(lov->lov_pools_hash_body, poolname);
+        if (pool == NULL) {
+                spin_unlock(&obd->obd_dev_lock);
+                return -ENOENT;
+        }
+
+        obd_str2uuid(&ost_uuid, ostname);
+
+        /* search ost in lov array, to get index */
+        for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) {
+                if (lov->lov_tgts[lov_idx] &&
+                    obd_uuid_equals(&ost_uuid,
+                                    &(lov->lov_tgts[lov_idx]->ltd_uuid)))
+                        break;
+        }
+        /* running off the end means the ost is not in this lov */
+        found = (lov_idx != lov->desc.ld_tgt_count);
+        spin_unlock(&obd->obd_dev_lock);
+        if (!found)
+                return -EINVAL;
+
+        lov_ost_pool_remove(&pool->pool_obds, lov_idx);
+
+        /* flag the pool's round-robin state as out of date */
+        pool->pool_rr.lqr_dirty = 1;
+
+        CDEBUG(D_CONFIG, "%s removed from "POOLNAMEF"\n", ostname, poolname);
+
+        return 0;
+}
+
+/* Tell whether OST index \a idx is a member of \a pool.
+ * Returns 0 when it is, -ENOENT when it is not. */
+int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool)
+{
+        int rc = -ENOENT;
+        int slot;
+
+        read_lock(&pool_tgt_rwlock(pool));
+        for (slot = 0; slot < pool_tgt_count(pool); slot++) {
+                if (pool_tgt_array(pool)[slot] == idx) {
+                        rc = 0;
+                        break;
+                }
+        }
+        read_unlock(&pool_tgt_rwlock(pool));
+        return rc;
+}
+
+/* Look up the pool named \a poolname in \a lov.
+ * Returns the pool, or NULL when the name is empty, when no such pool is
+ * in the hash, or when the pool has no member (the last two cases are
+ * logged with CWARN). */
+struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname)
+{
+        struct pool_desc *pool;
+
+        /* an empty name is not looked up at all */
+        if (poolname[0] == '\0')
+                return NULL;
+
+        pool = lustre_hash_lookup(lov->lov_pools_hash_body, poolname);
+        if (pool == NULL) {
+                CWARN("Request for an unknown pool ("POOLNAMEF")\n",
+                      poolname);
+                return NULL;
+        }
+
+        if (pool_tgt_count(pool) == 0) {
+                CWARN("Request for an empty pool ("POOLNAMEF")\n",
+                       poolname);
+                return NULL;
+        }
+
+        return pool;
+}
+
index 053ef6c..4a97573 100644 (file)
@@ -108,7 +108,7 @@ int qos_add_tgt(struct obd_device *obd, __u32 index)
         list_add_tail(&oss->lqo_oss_list, &temposs->lqo_oss_list);
 
         lov->lov_qos.lq_dirty = 1;
-        lov->lov_qos.lq_dirty_rr = 1;
+        lov->lov_qos.lq_rr.lqr_dirty = 1;
 
         CDEBUG(D_QOS, "add tgt %s to OSS %s (%d OSTs)\n",
                obd_uuid2str(&lov->lov_tgts[index]->ltd_uuid),
@@ -146,7 +146,7 @@ int qos_del_tgt(struct obd_device *obd, __u32 index)
         }
 
         lov->lov_qos.lq_dirty = 1;
-        lov->lov_qos.lq_dirty_rr = 1;
+        lov->lov_qos.lq_rr.lqr_dirty = 1;
 out:
         up_write(&lov->lov_qos.lq_rw_sem);
         RETURN(rc);
@@ -268,10 +268,11 @@ static int qos_calc_weight(struct lov_obd *lov, int i)
 }
 
 /* We just used this index for a stripe; adjust everyone's weights */
-static int qos_used(struct lov_obd *lov, __u32 index, __u64 *total_wt)
+static int qos_used(struct lov_obd *lov, struct ost_pool *osts,
+                    __u32 index, __u64 *total_wt)
 {
         struct lov_qos_oss *oss;
-        int i;
+        int j;
         ENTRY;
 
         /* Don't allocate from this stripe anymore, until the next alloc_qos */
@@ -301,7 +302,10 @@ static int qos_used(struct lov_obd *lov, __u32 index, __u64 *total_wt)
 
         *total_wt = 0;
         /* Decrease all OST penalties */
-        for (i = 0; i < lov->desc.ld_tgt_count; i++) {
+        for (j = 0; j < osts->op_count; j++) {
+                int i;
+
+                i = osts->op_array[j];
                 if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active)
                         continue;
                 if (lov->lov_tgts[i]->ltd_qos.ltq_penalty <
@@ -318,10 +322,11 @@ static int qos_used(struct lov_obd *lov, __u32 index, __u64 *total_wt)
                         *total_wt += lov->lov_tgts[i]->ltd_qos.ltq_weight;
 
 #ifdef QOS_DEBUG
-                CDEBUG(D_QOS, "recalc tgt %d avail="LPU64
+                CDEBUG(D_QOS, "recalc tgt %d usable=%d avail="LPU64
                        " ostppo="LPU64" ostp="LPU64" ossppo="LPU64
                        " ossp="LPU64" wt="LPU64"\n",
-                       i, TGT_BAVAIL(i) >> 10,
+                       i, lov->lov_tgts[i]->ltd_qos.ltq_usable,
+                       TGT_BAVAIL(i) >> 10,
                        lov->lov_tgts[i]->ltd_qos.ltq_penalty_per_obj >> 10,
                        lov->lov_tgts[i]->ltd_qos.ltq_penalty >> 10,
                        lov->lov_tgts[i]->ltd_qos.ltq_oss->lqo_penalty_per_obj>>10,
@@ -335,15 +340,16 @@ static int qos_used(struct lov_obd *lov, __u32 index, __u64 *total_wt)
 
 #define LOV_QOS_EMPTY ((__u32)-1)
 /* compute optimal round-robin order, based on OSTs per OSS */
-static int qos_calc_rr(struct lov_obd *lov)
+static int qos_calc_rr(struct lov_obd *lov, struct ost_pool *src_pool,
+                       struct lov_qos_rr *lqr)
 {
         struct lov_qos_oss *oss;
-        unsigned ost_count, placed, real_count;
-        int i;
+        unsigned placed, real_count;
+        int i, rc;
         ENTRY;
 
-        if (!lov->lov_qos.lq_dirty_rr) {
-                LASSERT(lov->lov_qos.lq_rr_size);
+        if (!lqr->lqr_dirty) {
+                LASSERT(lqr->lqr_pool.op_size);
                 RETURN(0);
         }
 
@@ -354,54 +360,45 @@ static int qos_calc_rr(struct lov_obd *lov)
          * Check again. While we were sleeping on @lq_rw_sem something could
          * change.
          */
-        if (!lov->lov_qos.lq_dirty_rr) {
-                LASSERT(lov->lov_qos.lq_rr_size);
+        if (!lqr->lqr_dirty) {
+                LASSERT(lqr->lqr_pool.op_size);
                 up_write(&lov->lov_qos.lq_rw_sem);
                 RETURN(0);
         }
 
-        ost_count = lov->desc.ld_tgt_count;
-
-        if (lov->lov_qos.lq_rr_size)
-                OBD_FREE(lov->lov_qos.lq_rr_array, lov->lov_qos.lq_rr_size);
-        lov->lov_qos.lq_rr_size = ost_count *
-                sizeof(lov->lov_qos.lq_rr_array[0]);
-        OBD_ALLOC(lov->lov_qos.lq_rr_array, lov->lov_qos.lq_rr_size);
-        if (!lov->lov_qos.lq_rr_array) {
-                lov->lov_qos.lq_rr_size = 0;
+        if (lqr->lqr_pool.op_size)
+                lov_ost_pool_free(&lqr->lqr_pool);
+        rc = lov_ost_pool_init(&lqr->lqr_pool, src_pool->op_count);
+        if (rc) {
                 up_write(&lov->lov_qos.lq_rw_sem);
-                RETURN(-ENOMEM);
+                RETURN(rc);
         }
 
-        real_count = 0;
-        for (i = 0; i < ost_count; i++) {
-                lov->lov_qos.lq_rr_array[i] = LOV_QOS_EMPTY;
-                if (lov->lov_tgts[i])
-                        real_count++;
-        }
+        for (i = 0; i < src_pool->op_count; i++)
+                lqr->lqr_pool.op_array[i] = LOV_QOS_EMPTY;
+        lqr->lqr_pool.op_count = src_pool->op_count;
 
         /* Place all the OSTs from 1 OSS at the same time. */
+        real_count = lqr->lqr_pool.op_count;
         placed = 0;
         list_for_each_entry(oss, &lov->lov_qos.lq_oss_list, lqo_oss_list) {
                 int j = 0;
-                for (i = 0; i < ost_count; i++) {
-                        if (lov->lov_tgts[i] &&
-                            lov->lov_tgts[i]->ltd_qos.ltq_oss == oss) {
+                for (i = 0; i < lqr->lqr_pool.op_count; i++) {
+                        if (lov->lov_tgts[src_pool->op_array[i]] &&
+                            (lov->lov_tgts[src_pool->op_array[i]]->ltd_qos.ltq_oss == oss)) {
                               /* Evenly space these OSTs across arrayspace */
-                              int next = j * ost_count / oss->lqo_ost_count;
-                              LASSERT(next < ost_count);
-                              while (lov->lov_qos.lq_rr_array[next] !=
+                              int next = j * lqr->lqr_pool.op_count / oss->lqo_ost_count;
+                              while (lqr->lqr_pool.op_array[next] !=
                                      LOV_QOS_EMPTY)
-                                      next = (next + 1) % ost_count;
-                              lov->lov_qos.lq_rr_array[next] = i;
+                                        next = (next + 1) % lqr->lqr_pool.op_count;
+                              lqr->lqr_pool.op_array[next] = src_pool->op_array[i];
                               j++;
                               placed++;
                         }
                 }
-                LASSERT(j == oss->lqo_ost_count);
         }
 
-        lov->lov_qos.lq_dirty_rr = 0;
+        lqr->lqr_dirty = 0;
         up_write(&lov->lov_qos.lq_rw_sem);
 
         if (placed != real_count) {
@@ -409,18 +406,18 @@ static int qos_calc_rr(struct lov_obd *lov)
                 LCONSOLE_ERROR_MSG(0x14e, "Failed to place all OSTs in the "
                                    "round-robin list (%d of %d).\n",
                                    placed, real_count);
-                for (i = 0; i < ost_count; i++) {
+                for (i = 0; i < lqr->lqr_pool.op_count; i++) {
                         LCONSOLE(D_WARNING, "rr #%d ost idx=%d\n", i,
-                                 lov->lov_qos.lq_rr_array[i]);
+                                 lqr->lqr_pool.op_array[i]);
                 }
-                lov->lov_qos.lq_dirty_rr = 1;
+                lqr->lqr_dirty = 1;
                 RETURN(-EAGAIN);
         }
 
 #ifdef QOS_DEBUG
-        for (i = 0; i < ost_count; i++) {
+        for (i = 0; i < lqr->lqr_pool.op_count; i++) {
                 LCONSOLE(D_QOS, "rr #%d ost idx=%d\n", i,
-                         lov->lov_qos.lq_rr_array[i]);
+                         lqr->lqr_pool.op_array[i]);
         }
 #endif
 
@@ -519,54 +516,70 @@ static int min_stripe_count(int stripe_cnt, int flags)
 #define LOV_CREATE_RESEED_MIN  1000
 /* Allocate objects on osts with round-robin algorithm */
 static int alloc_rr(struct lov_obd *lov, int *idx_arr, int *stripe_cnt,
-                    int flags)
+                    char *poolname, int flags)
 {
-        unsigned array_idx, ost_count = lov->desc.ld_tgt_count;
-        unsigned ost_active_count = lov->desc.ld_active_tgt_count;
+        unsigned array_idx;
         int i, *idx_pos;
         __u32 ost_idx;
         int ost_start_idx_temp;
         int speed = 0;
         int stripe_cnt_min = min_stripe_count(*stripe_cnt, flags);
+        struct pool_desc *pool;
+        struct ost_pool *osts;
+        struct lov_qos_rr *lqr;
         ENTRY;
 
-        i = qos_calc_rr(lov);
-        if (i)
+        pool = lov_find_pool(lov, poolname);
+        if (pool == NULL) {
+                osts = &(lov->lov_packed);
+                lqr = &(lov->lov_qos.lq_rr);
+        } else {
+                read_lock(&pool_tgt_rwlock(pool));
+                osts = &(pool->pool_obds);
+                lqr = &(pool->pool_rr);
+        }
+
+        i = qos_calc_rr(lov, osts, lqr);
+        if (i) {
+                if (pool != NULL)
+                        read_unlock(&pool_tgt_rwlock(pool));
                 RETURN(i);
+        }
 
-        if (--lov->lov_start_count <= 0) {
-                lov->lov_start_idx = ll_rand() % ost_count;
-                lov->lov_start_count =
-                        (LOV_CREATE_RESEED_MIN / max(ost_active_count, 1U) +
-                         LOV_CREATE_RESEED_MULT) * max(ost_active_count, 1U);
-        } else if (stripe_cnt_min >= ost_active_count ||
-                   lov->lov_start_idx > ost_count) {
+        if (--lqr->lqr_start_count <= 0) {
+                lqr->lqr_start_idx = ll_rand() % osts->op_count;
+                lqr->lqr_start_count =
+                        (LOV_CREATE_RESEED_MIN / max(osts->op_count, 1U) +
+                         LOV_CREATE_RESEED_MULT) * max(osts->op_count, 1U);
+        } else if (stripe_cnt_min >= osts->op_count ||
+                   lqr->lqr_start_idx > osts->op_count) {
                 /* If we have allocated from all of the OSTs, slowly
                  * precess the next start if the OST/stripe count isn't
                  * already doing this for us. */
-                lov->lov_start_idx %= ost_count;
-                if (*stripe_cnt > 1 && (ost_active_count % (*stripe_cnt)) != 1)
-                        ++lov->lov_offset_idx;
+                lqr->lqr_start_idx %= osts->op_count;
+                if (*stripe_cnt > 1 && (osts->op_count % (*stripe_cnt)) != 1)
+                        ++lqr->lqr_offset_idx;
         }
         down_read(&lov->lov_qos.lq_rw_sem);
-        ost_start_idx_temp = lov->lov_start_idx;
+        ost_start_idx_temp = lqr->lqr_start_idx;
 
 repeat_find:
-        array_idx = (lov->lov_start_idx + lov->lov_offset_idx) % ost_count;
+        array_idx = (lqr->lqr_start_idx + lqr->lqr_offset_idx) % osts->op_count;
         idx_pos = idx_arr;
 #ifdef QOS_DEBUG
-        CDEBUG(D_QOS, "want %d startidx %d startcnt %d offset %d active %d "
-               "count %d arrayidx %d\n",
-               stripe_cnt, lov->lov_start_idx, lov->lov_start_count,
-               lov->lov_offset_idx, ost_active_count, ost_count, array_idx);
+        CDEBUG(D_QOS, "pool '%s' want %d startidx %d startcnt %d offset %d "
+               "active %d count %d arrayidx %d\n", poolname,
+               *stripe_cnt, lqr->lqr_start_idx, lqr->lqr_start_count,
+               lqr->lqr_offset_idx, osts->op_count, osts->op_count, array_idx);
 #endif
 
-        for (i = 0; i < ost_count; i++, array_idx=(array_idx + 1) % ost_count) {
-                ++lov->lov_start_idx;
-                ost_idx = lov->lov_qos.lq_rr_array[array_idx];
+        for (i = 0; i < osts->op_count;
+                    i++, array_idx=(array_idx + 1) % osts->op_count) {
+                ++lqr->lqr_start_idx;
+                ost_idx = lqr->lqr_pool.op_array[array_idx];
 #ifdef QOS_DEBUG
                 CDEBUG(D_QOS, "#%d strt %d act %d strp %d ary %d idx %d\n",
-                       i, lov->lov_start_idx,
+                       i, lqr->lqr_start_idx,
                        ((ost_idx != LOV_QOS_EMPTY) && lov->lov_tgts[ost_idx]) ?
                        lov->lov_tgts[ost_idx]->ltd_active : 0,
                        idx_pos - idx_arr, array_idx, ost_idx);
@@ -593,10 +606,13 @@ repeat_find:
         if ((speed < 2) && (idx_pos - idx_arr < stripe_cnt_min)) {
                 /* Try again, allowing slower OSCs */
                 speed++;
-                lov->lov_start_idx = ost_start_idx_temp;
+                lqr->lqr_start_idx = ost_start_idx_temp;
                 goto repeat_find;
         }
 
+        if (pool != NULL)
+                read_unlock(&pool_tgt_rwlock(pool));
+
         up_read(&lov->lov_qos.lq_rw_sem);
 
         *stripe_cnt = idx_pos - idx_arr;
@@ -607,15 +623,45 @@ repeat_find:
 static int alloc_specific(struct lov_obd *lov, struct lov_stripe_md *lsm,
                           int *idx_arr)
 {
-        unsigned ost_idx, ost_count = lov->desc.ld_tgt_count;
+        unsigned ost_idx, array_idx, ost_count;
         int i, *idx_pos;
         int speed = 0;
+        struct pool_desc *pool = NULL;
+        struct ost_pool *osts;
         ENTRY;
 
+        pool = lov_find_pool(lov, lsm->lsm_pool_name);
+        if (pool == NULL) {
+                osts = &(lov->lov_packed);
+        } else {
+                read_lock(&pool_tgt_rwlock(pool));
+                osts = &(pool->pool_obds);
+        }
+
+        ost_count = osts->op_count;
+
 repeat_find:
-        ost_idx = lsm->lsm_oinfo[0]->loi_ost_idx;
+        /* search loi_ost_idx in ost array */
+        array_idx = 0;
+        for (i = 0; i < ost_count; i++) {
+                if (osts->op_array[i] == lsm->lsm_oinfo[0]->loi_ost_idx) {
+                        array_idx = i;
+                        break;
+                }
+        }
+        if (i == ost_count) {
+                if (pool != NULL)
+                        read_unlock(&pool_tgt_rwlock(pool));
+                CERROR("Start index %d not found in pool '%s'\n",
+                       lsm->lsm_oinfo[0]->loi_ost_idx, lsm->lsm_pool_name);
+                RETURN(-EINVAL);
+        }
+
         idx_pos = idx_arr;
-        for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) {
+        for (i = 0; i < ost_count;
+             i++, array_idx = (array_idx + 1) % ost_count) {
+                ost_idx = osts->op_array[array_idx];
+
                 if (!lov->lov_tgts[ost_idx] ||
                     !lov->lov_tgts[ost_idx]->ltd_active) {
                         continue;
@@ -634,8 +680,11 @@ repeat_find:
                 *idx_pos = ost_idx;
                 idx_pos++;
                 /* We have enough stripes */
-                if (idx_pos - idx_arr == lsm->lsm_stripe_count)
+                if (idx_pos - idx_arr == lsm->lsm_stripe_count) {
+                        if (pool != NULL)
+                                read_unlock(&pool_tgt_rwlock(pool));
                         RETURN(0);
+                }
         }
         if (speed < 2) {
                 /* Try again, allowing slower OSCs */
@@ -652,6 +701,10 @@ repeat_find:
         CERROR("can't lstripe objid "LPX64": have %d want %u\n",
                lsm->lsm_object_id, (int)(idx_pos - idx_arr),
                lsm->lsm_stripe_count);
+
+        if (pool != NULL)
+                read_unlock(&pool_tgt_rwlock(pool));
+
         RETURN(-EFBIG);
 }
 
@@ -660,20 +713,32 @@ repeat_find:
    - network resources (shared OSS's)
 */
 static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt,
-                     int flags)
+                     char *poolname, int flags)
 {
         struct lov_obd *lov = &exp->exp_obd->u.lov;
         static time_t last_warn = 0;
         time_t now = cfs_time_current_sec();
         __u64 total_bavail, total_weight = 0;
-        __u32 ost_count;
         int nfound, good_osts, i, warn = 0, rc = 0;
         int stripe_cnt_min = min_stripe_count(*stripe_cnt, flags);
+        struct pool_desc *pool;
+        struct ost_pool *osts;
+        struct lov_qos_rr *lqr;
         ENTRY;
 
         if (stripe_cnt_min < 1)
                 GOTO(out_nolock, rc = -EINVAL);
 
+        pool = lov_find_pool(lov, poolname);
+        if (pool == NULL) {
+                osts = &(lov->lov_packed);
+                lqr = &(lov->lov_qos.lq_rr);
+        } else {
+                read_lock(&pool_tgt_rwlock(pool));
+                osts = &(pool->pool_obds);
+                lqr = &(pool->pool_rr);
+        }
+
         lov_getref(exp->exp_obd);
 
         /* Detect -EAGAIN early, before expensive lock is taken. */
@@ -690,8 +755,6 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt,
         if (!lov->lov_qos.lq_dirty && lov->lov_qos.lq_same_space)
                 GOTO(out, rc = -EAGAIN);
 
-        ost_count = lov->desc.ld_tgt_count;
-
         if (lov->desc.ld_active_tgt_count < 2)
                 GOTO(out, rc = -EAGAIN);
 
@@ -705,24 +768,25 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt,
         if (cfs_time_sub(now, last_warn) > 60 * 30)
                 warn = 1;
         /* Find all the OSTs that are valid stripe candidates */
-        for (i = 0; i < ost_count; i++) {
+        for (i = 0; i < osts->op_count; i++) {
                 __u64 bavail;
 
-                if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active)
+                if (!lov->lov_tgts[osts->op_array[i]] ||
+                    !lov->lov_tgts[osts->op_array[i]]->ltd_active)
                         continue;
-                bavail = TGT_BAVAIL(i);
+                bavail = TGT_BAVAIL(osts->op_array[i]);
                 if (!bavail) {
                         if (warn) {
                                 CDEBUG(D_QOS, "no free space on %s\n",
-                                     obd_uuid2str(&lov->lov_tgts[i]->ltd_uuid));
+                                     obd_uuid2str(&lov->lov_tgts[osts->op_array[i]]->ltd_uuid));
                                 last_warn = now;
                         }
                         continue;
                 }
-                if (!TGT_FFREE(i)) {
+                if (!TGT_FFREE(osts->op_array[i])) {
                         if (warn) {
                                 CDEBUG(D_QOS, "no free inodes on %s\n",
-                                     obd_uuid2str(&lov->lov_tgts[i]->ltd_uuid));
+                                     obd_uuid2str(&lov->lov_tgts[osts->op_array[i]]->ltd_uuid));
                                 last_warn = now;
                         }
                         continue;
@@ -730,20 +794,24 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt,
 
                 /* Fail Check before osc_precreate() is called
                    so we can only 'fail' single OSC. */
-                if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && i == 0)
+                if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && osts->op_array[i] == 0)
                         continue;
 
-                if (obd_precreate(lov->lov_tgts[i]->ltd_exp) > 2)
+                if (obd_precreate(lov->lov_tgts[osts->op_array[i]]->ltd_exp) > 2)
                         continue;
 
-                lov->lov_tgts[i]->ltd_qos.ltq_usable = 1;
-                qos_calc_weight(lov, i);
+                lov->lov_tgts[osts->op_array[i]]->ltd_qos.ltq_usable = 1;
+                qos_calc_weight(lov, osts->op_array[i]);
                 total_bavail += bavail;
-                total_weight += lov->lov_tgts[i]->ltd_qos.ltq_weight;
+                total_weight += lov->lov_tgts[osts->op_array[i]]->ltd_qos.ltq_weight;
 
                 good_osts++;
         }
 
+#ifdef QOS_DEBUG
+        CDEBUG(D_QOS, "found %d good osts\n", good_osts);
+#endif
+
         if (good_osts < stripe_cnt_min)
                 GOTO(out, rc = -EAGAIN);
 
@@ -792,19 +860,24 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt,
 
                 /* On average, this will hit larger-weighted osts more often.
                    0-weight osts will always get used last (only when rand=0).*/
-                for (i = 0; i < ost_count; i++) {
-                        if (!lov->lov_tgts[i] ||
-                            !lov->lov_tgts[i]->ltd_qos.ltq_usable)
+                for (i = 0; i < osts->op_count; i++) {
+                        if (!lov->lov_tgts[osts->op_array[i]] ||
+                            !lov->lov_tgts[osts->op_array[i]]->ltd_qos.ltq_usable)
                                 continue;
 
-                        cur_weight += lov->lov_tgts[i]->ltd_qos.ltq_weight;
+                        cur_weight += lov->lov_tgts[osts->op_array[i]]->ltd_qos.ltq_weight;
+#ifdef QOS_DEBUG
+                        CDEBUG(D_QOS, "stripe_cnt=%d nfound=%d cur_weight="LPU64
+                                      " rand="LPU64" total_weight="LPU64"\n",
+                               *stripe_cnt, nfound, cur_weight, rand, total_weight);
+#endif
                         if (cur_weight >= rand) {
 #ifdef QOS_DEBUG
                                 CDEBUG(D_QOS, "assigned stripe=%d to idx=%d\n",
-                                       nfound, i);
+                                       nfound, osts->op_array[i]);
 #endif
-                                idx_arr[nfound++] = i;
-                                qos_used(lov, i, &total_weight);
+                                idx_arr[nfound++] = osts->op_array[i];
+                                qos_used(lov, osts, osts->op_array[i], &total_weight);
                                 rc = 0;
                                 break;
                         }
@@ -818,11 +891,14 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt,
         LASSERT(nfound == *stripe_cnt);
 
 out:
+        if (pool != NULL)
+                read_unlock(&pool_tgt_rwlock(pool));
+
         up_write(&lov->lov_qos.lq_rw_sem);
 
 out_nolock:
         if (rc == -EAGAIN)
-                rc = alloc_rr(lov, idx_arr, stripe_cnt, flags);
+                rc = alloc_rr(lov, idx_arr, stripe_cnt, poolname, flags);
 
         lov_putref(exp->exp_obd);
         RETURN(rc);
@@ -847,7 +923,8 @@ static int alloc_idx_array(struct obd_export *exp, struct lov_stripe_md *lsm,
 
         if (newea ||
             lsm->lsm_oinfo[0]->loi_ost_idx >= lov->desc.ld_tgt_count)
-                rc = alloc_qos(exp, tmp_arr, &stripe_cnt, flags);
+                rc = alloc_qos(exp, tmp_arr, &stripe_cnt,
+                               lsm->lsm_pool_name, flags);
         else
                 rc = alloc_specific(lov, lsm, tmp_arr);
 
index 182a8b7..d05c902 100644 (file)
@@ -265,12 +265,11 @@ static int mdd_lov_set_dir_md(const struct lu_env *env,
         LASSERT(S_ISDIR(mdd_object_type(obj)));
         lum = (struct lov_user_md*)buf->lb_buf;
 
-        /* if { size, offset, count } = { 0, -1, 0 } (i.e. all default
+        /* if { size, offset, count } = { 0, -1, 0 } and no pool (i.e. all default
          * values specified) then delete default striping from dir. */
-        if ((lum->lmm_stripe_size == 0 && lum->lmm_stripe_count == 0 &&
-             lum->lmm_stripe_offset == (typeof(lum->lmm_stripe_offset))(-1)) ||
-             /* lmm_stripe_size == -1 is deprecated in 1.4.6 */
-             lum->lmm_stripe_size == (typeof(lum->lmm_stripe_size))(-1)){
+        if (lum->lmm_stripe_size == 0 && lum->lmm_stripe_count == 0 &&
+            lum->lmm_stripe_offset == (typeof(lum->lmm_stripe_offset))(-1) &&
+            lum->lmm_magic != LOV_USER_MAGIC_V3) {
                 rc = mdd_xattr_set_txn(env, obj, &LU_BUF_NULL,
                                        MDS_LOV_MD_NAME, 0, handle);
                 if (rc == -ENODATA)
@@ -324,7 +323,7 @@ int mdd_lov_set_md(const struct lu_env *env, struct mdd_object *pobj,
                 if (lmmp == NULL && lmm_size == 0) {
                         struct mdd_device *mdd = mdd_obj2mdd_dev(child);
                         struct lov_mds_md *lmm = mdd_max_lmm_get(env, mdd);
-                        int size = sizeof(*lmm);
+                        int size = sizeof(struct lov_mds_md_v3);
 
                         /* Get parent dir stripe and set */
                         if (pobj != NULL)
@@ -362,15 +361,21 @@ static void mdd_lov_update_objids(struct obd_device *obd, struct lov_mds_md *lmm
 {
         struct mds_obd *mds = &obd->u.mds;
         int j;
+        struct lov_ost_data_v1 *lmm_objects;
         ENTRY;
 
         /* if we create file without objects - lmm is NULL */
         if (lmm == NULL)
                 return;
 
+        if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V3)
+                lmm_objects = ((struct lov_mds_md_v3 *)lmm)->lmm_objects;
+        else
+                lmm_objects = lmm->lmm_objects;
+
         for (j = 0; j < le32_to_cpu(lmm->lmm_stripe_count); j++) {
-                int i = le32_to_cpu(lmm->lmm_objects[j].l_ost_idx);
-                obd_id id = le64_to_cpu(lmm->lmm_objects[j].l_object_id);
+                int i = le32_to_cpu(lmm_objects[j].l_ost_idx);
+                obd_id id = le64_to_cpu(lmm_objects[j].l_object_id);
                 int page = i / OBJID_PER_PAGE();
                 int idx = i % OBJID_PER_PAGE();
                 obd_id *data = mds->mds_lov_page_array[page];
index 743762f..01ab561 100644 (file)
@@ -121,7 +121,10 @@ int mdd_log_txn_param_build(const struct lu_env *env, struct md_object *obj,
         if (rc || !(ma->ma_valid & MA_LOV))
                 RETURN(rc);
 
-        LASSERT(le32_to_cpu(ma->ma_lmm->lmm_magic) == LOV_MAGIC);
+        LASSERTF(le32_to_cpu(ma->ma_lmm->lmm_magic) == LOV_MAGIC_V1 ||
+                 le32_to_cpu(ma->ma_lmm->lmm_magic) == LOV_MAGIC_V3,
+                 "%08x", le32_to_cpu(ma->ma_lmm->lmm_magic));
+
         if ((int)le32_to_cpu(ma->ma_lmm->lmm_stripe_count) < 0)
                 stripe = mdd2obd_dev(mdd)->u.mds.mds_lov_desc.ld_tgt_count;
         else
index 2c37cc5..ee256b2 100644 (file)
@@ -410,7 +410,7 @@ static int mds_cmd_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
         if (rc)
                 GOTO(err_objects, rc);
 
-        mds->mds_max_mdsize = sizeof(struct lov_mds_md);
+        mds->mds_max_mdsize = sizeof(struct lov_mds_md_v3);
         mds->mds_max_cookiesize = sizeof(struct llog_cookie);
 
 err_pop:
index 4d30fe3..9cf0e71 100644 (file)
@@ -64,7 +64,8 @@ int mds_post_mds_lovconf(struct obd_device *obd);
 int mds_notify(struct obd_device *obd, struct obd_device *watched,
                enum obd_notify_event ev, void *data);
 int mds_convert_lov_ea(struct obd_device *obd, struct inode *inode,
-                       struct lov_mds_md *lmm, int lmm_size);
+                       struct lov_mds_md *lmm, int lmm_size,
+                       __u64 connect_flags);
 int mds_init_lov_desc(struct obd_device *obd, struct obd_export *osc_exp);
 
 int mds_obd_create(struct obd_export *exp, struct obdo *oa,
index 1f0f995..07444ac 100644 (file)
@@ -367,7 +367,7 @@ static int mds_lov_update_desc(struct obd_device *obd, struct obd_export *lov)
         stripes = min_t(__u32, LOV_MAX_STRIPE_COUNT,
                                mds->mds_lov_desc.ld_tgt_count);
 
-        mds->mds_max_mdsize = lov_mds_md_size(stripes);
+        mds->mds_max_mdsize = lov_mds_md_size(stripes, LOV_MAGIC_V3);
         mds->mds_max_cookiesize = stripes * sizeof(struct llog_cookie);
         CDEBUG(D_CONFIG, "updated max_mdsize/max_cookiesize for %d stripes: "
                "%d/%d\n", mds->mds_max_mdsize, mds->mds_max_cookiesize,
index 3208572..1bd26d6 100644 (file)
@@ -357,6 +357,21 @@ static int mgs_put_cfg_lock(struct lustre_handle *lockh)
         RETURN(0);
 }
 
+/* Get, then at once put, the config lock of @fsname; no-op for an empty name. */
+static void mgs_revoke_lock(struct obd_device *obd, char *fsname,
+                            struct lustre_handle *lockh)
+{
+        int lockrc;
+
+        if (!fsname[0])
+                return;
+        lockrc = mgs_get_cfg_lock(obd, fsname, lockh);
+        if (lockrc == ELDLM_OK)
+                mgs_put_cfg_lock(lockh);
+        else
+                CERROR("lock error %d for fs %s\n", lockrc, fsname);
+}
+
 /* rc=0 means ok
       1 means update
      <0 means error */
@@ -508,7 +523,7 @@ static int mgs_set_info_rpc(struct ptlrpc_request *req)
         struct obd_device *obd = req->rq_export->exp_obd;
         struct mgs_send_param *msp, *rep_msp;
         struct lustre_handle lockh;
-        int lockrc, rc;
+        int rc;
         struct lustre_cfg_bufs bufs;
         struct lustre_cfg *lcfg;
         char fsname[MTI_NAME_MAXLEN];
@@ -528,19 +543,9 @@ static int mgs_set_info_rpc(struct ptlrpc_request *req)
                 RETURN(rc);
         }
 
-        /* Revoke lock so everyone updates.  Should be alright if
-         * someone was already reading while we were updating the logs,
-         * so we don't really need to hold the lock while we're
-         * writing.
-         */
-        if (fsname[0]) {
-                lockrc = mgs_get_cfg_lock(obd, fsname, &lockh);
-                if (lockrc != ELDLM_OK)
-                        CERROR("lock error %d for fs %s\n", lockrc,
-                               fsname);
-                else
-                        mgs_put_cfg_lock(&lockh);
-        }
+        /* request for update */
+        mgs_revoke_lock(obd, fsname, &lockh);
+
         lustre_cfg_free(lcfg);
 
         rc = req_capsule_server_pack(&req->rq_pill);
@@ -709,6 +714,134 @@ static inline int mgs_destroy_export(struct obd_export *exp)
         RETURN(0);
 }
 
+/* Split "<fsname>.<poolname>" from @arg into @fsname (MTI_NAME_MAXLEN bytes)
+ * and @poolname (MAXPOOLNAME + 1 bytes); -EINVAL if malformed or too long. */
+static int mgs_extract_fs_pool(char * arg, char *fsname, char *poolname)
+{
+        char *dot;
+
+        ENTRY;
+        fsname[0] = poolname[0] = '\0';
+        dot = arg != NULL ? strchr(arg, '.') : NULL;
+        if (dot == NULL || dot - arg >= MTI_NAME_MAXLEN ||
+            strlen(dot + 1) > MAXPOOLNAME)
+                RETURN(-EINVAL);
+        memcpy(fsname, arg, dot - arg);
+        fsname[dot - arg] = '\0';
+        strcpy(poolname, dot + 1);
+        RETURN(0);
+}
+
+/*
+ * Handle OBD_IOC_POOL.  Copies the lustre_cfg record described by @data from
+ * user space, splits its first string ("<fsname>.<poolname>"), runs the
+ * requested LCFG_POOL_* command through mgs_pool_cmd() and, on success,
+ * calls mgs_revoke_lock() for that filesystem.
+ * Returns 0 or a negative errno.  Every exit after the first allocation
+ * leaves through out_pool so that the temporary buffers are always freed.
+ */
+static int mgs_iocontrol_pool(struct obd_device *obd,
+                              struct obd_ioctl_data *data)
+{
+        struct lustre_handle lockh;
+        struct lustre_cfg *lcfg = NULL;
+        char *fsname = NULL;
+        char *poolname = NULL;
+        int rc;
+        ENTRY;
+
+        OBD_ALLOC(fsname, MTI_NAME_MAXLEN);
+        if (fsname == NULL)
+                RETURN(-ENOMEM);
+
+        OBD_ALLOC(poolname, MAXPOOLNAME + 1);
+        if (poolname == NULL)
+                GOTO(out_pool, rc = -ENOMEM);
+
+        if (data->ioc_type != LUSTRE_CFG_TYPE) {
+                CERROR("unknown cfg record type:%d \n", data->ioc_type);
+                GOTO(out_pool, rc = -EINVAL);
+        }
+
+        if (data->ioc_plen1 > CFS_PAGE_SIZE)
+                GOTO(out_pool, rc = -E2BIG);
+
+        OBD_ALLOC(lcfg, data->ioc_plen1);
+        if (lcfg == NULL)
+                GOTO(out_pool, rc = -ENOMEM);
+
+        /* copy_from_user() returns the number of bytes it could NOT copy;
+         * report that as -EFAULT instead of a positive count */
+        if (copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1))
+                GOTO(out_pool, rc = -EFAULT);
+
+        if (lcfg->lcfg_bufcount < 2)
+                GOTO(out_pool, rc = -EINVAL);
+
+        /* first arg is always <fsname>.<poolname> */
+        rc = mgs_extract_fs_pool(lustre_cfg_string(lcfg, 1), fsname,
+                                 poolname);
+        if (rc)
+                GOTO(out_pool, rc);
+
+        /* a bad argument count must also leave through out_pool, otherwise
+         * lcfg, fsname and poolname would be leaked */
+        switch (lcfg->lcfg_command) {
+        case LCFG_POOL_NEW: {
+                if (lcfg->lcfg_bufcount != 2)
+                        GOTO(out_pool, rc = -EINVAL);
+                rc = mgs_pool_cmd(obd, LCFG_POOL_NEW, fsname,
+                                  poolname, NULL);
+                break;
+        }
+        case LCFG_POOL_ADD: {
+                if (lcfg->lcfg_bufcount != 3)
+                        GOTO(out_pool, rc = -EINVAL);
+                rc = mgs_pool_cmd(obd, LCFG_POOL_ADD, fsname, poolname,
+                                  lustre_cfg_string(lcfg, 2));
+                break;
+        }
+        case LCFG_POOL_REM: {
+                if (lcfg->lcfg_bufcount != 3)
+                        GOTO(out_pool, rc = -EINVAL);
+                rc = mgs_pool_cmd(obd, LCFG_POOL_REM, fsname, poolname,
+                                  lustre_cfg_string(lcfg, 2));
+                break;
+        }
+        case LCFG_POOL_DEL: {
+                if (lcfg->lcfg_bufcount != 2)
+                        GOTO(out_pool, rc = -EINVAL);
+                rc = mgs_pool_cmd(obd, LCFG_POOL_DEL, fsname,
+                                  poolname, NULL);
+                break;
+        }
+        default: {
+                 GOTO(out_pool, rc = -EINVAL);
+        }
+        }
+
+        if (rc) {
+                CERROR("OBD_IOC_POOL err %d, cmd %X for pool %s.%s\n",
+                       rc, lcfg->lcfg_command, fsname, poolname);
+                GOTO(out_pool, rc);
+        }
+
+        /* request for update */
+        mgs_revoke_lock(obd, fsname, &lockh);
+
+out_pool:
+        if (lcfg != NULL)
+                OBD_FREE(lcfg, data->ioc_plen1);
+
+        if (fsname != NULL)
+                OBD_FREE(fsname, MTI_NAME_MAXLEN);
+
+        if (poolname != NULL)
+                OBD_FREE(poolname, MAXPOOLNAME + 1);
+
+        RETURN(rc);
+}
+
 /* from mdt_iocontrol */
 int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                   void *karg, void *uarg)
@@ -728,7 +861,6 @@ int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                 struct lustre_cfg *lcfg;
                 struct llog_rec_hdr rec;
                 char fsname[MTI_NAME_MAXLEN];
-                int lockrc;
 
                 rec.lrh_len = llog_data_len(data->ioc_plen1);
 
@@ -759,20 +891,17 @@ int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                    someone was already reading while we were updating the logs,
                    so we don't really need to hold the lock while we're
                    writing (above). */
-                if (fsname[0]) {
-                        lockrc = mgs_get_cfg_lock(obd, fsname, &lockh);
-                        if (lockrc != ELDLM_OK)
-                                CERROR("lock error %d for fs %s\n", lockrc,
-                                       fsname);
-                        else
-                                mgs_put_cfg_lock(&lockh);
-                }
+                mgs_revoke_lock(obd, fsname, &lockh);
 
 out_free:
                 OBD_FREE(lcfg, data->ioc_plen1);
                 RETURN(rc);
         }
 
+        case OBD_IOC_POOL: {
+                RETURN(mgs_iocontrol_pool(obd, data));
+        }
+
         case OBD_IOC_DUMP_LOG: {
                 struct llog_ctxt *ctxt;
                 ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
index c0620a1..d5d18bd 100644 (file)
@@ -95,6 +95,9 @@ int mgs_erase_log(struct obd_device *obd, char *name);
 int mgs_erase_logs(struct obd_device *obd, char *fsname);
 int mgs_setparam(struct obd_device *obd, struct lustre_cfg *lcfg, char *fsname);
 
+/* Record a pool command in the configuration logs.  Argument order matches
+ * the definition and its callers: filesystem name, pool name, then the OST
+ * name (callers pass NULL for the new/del commands). */
+int mgs_pool_cmd(struct obd_device *obd, enum lcfg_command_type cmd,
+                 char *fsname, char *poolname, char *ostname);
+
 /* mgs_fs.c */
 int mgs_fs_setup(struct obd_device *obd, struct vfsmount *mnt);
 int mgs_fs_cleanup(struct obd_device *obddev);
index 8c3f77d..b621ca6 100644 (file)
@@ -3235,6 +3235,145 @@ out:
         RETURN(rc);
 }
 
+/*
+ * Append one pool configuration record to the configuration log @logname.
+ *
+ * The record is written with record_base() for device @lovname and command
+ * @cmd, carrying @fsname, @poolname and @ostname in that order, and is
+ * bracketed by CM_START/CM_END markers that carry @comment.
+ *
+ * Returns the error from record_start_log() if the log cannot be started.
+ * Otherwise returns the first error hit while writing the three records, or
+ * the status of record_end_log() when all of them succeeded.  The log is
+ * always ended once it has been started.
+ */
+static int mgs_write_log_pool(struct obd_device *obd, char *logname,
+                              struct fs_db *fsdb, char *lovname,
+                              enum lcfg_command_type cmd,
+                              char *fsname, char *poolname,
+                              char *ostname, char *comment)
+{
+        struct llog_handle *llh = NULL;
+        int rc, rc2;
+
+        rc = record_start_log(obd, &llh, logname);
+        if (rc)
+                return rc;
+
+        rc = record_marker(obd, llh, fsdb, CM_START, lovname, comment);
+        if (rc)
+                goto out_end;
+
+        rc = record_base(obd, llh, lovname, 0, cmd, fsname, poolname,
+                         ostname, 0);
+        if (rc)
+                goto out_end;
+
+        rc = record_marker(obd, llh, fsdb, CM_END, lovname, comment);
+
+out_end:
+        /* close the log even after a failure, but do not let a successful
+         * close hide the earlier error */
+        rc2 = record_end_log(obd, &llh);
+        if (rc == 0)
+                rc = rc2;
+
+        return rc;
+}
+
+/*
+ * Record a pool command for pool @poolname of filesystem @fsname in the
+ * "<fsname>-MDT0000" and "<fsname>-client" configuration logs.
+ *
+ * @cmd is one of LCFG_POOL_NEW, LCFG_POOL_ADD, LCFG_POOL_REM or
+ * LCFG_POOL_DEL.  @ostname is required for ADD and REM; when given, the part
+ * before its last '-' must be exactly @fsname.
+ *
+ * For REM and DEL the earlier "add"/"new" record with the same names is
+ * first flagged CM_SKIP through mgs_modify() (best effort, its result is not
+ * checked), then the new record is appended.
+ *
+ * Returns 0 on success, -EINVAL for an undefined filesystem, a bad or missing
+ * @ostname or an unknown @cmd, -ENOMEM on allocation failure, or the error
+ * returned while creating the log names or writing either log.
+ */
+int mgs_pool_cmd(struct obd_device *obd, enum lcfg_command_type cmd,
+                 char *fsname, char *poolname, char *ostname)
+{
+        struct fs_db *fsdb;
+        char mdt_index[16];
+        char *lovname = NULL;
+        char *logname = NULL;
+        char *label = NULL, *canceled_label = NULL;
+        int label_sz;
+        struct mgs_target_info *mti = NULL;
+        int rc;
+        ENTRY;
+
+        rc = mgs_find_or_make_fsdb(obd, fsname, &fsdb);
+        if (rc) {
+                CERROR("Can't get db for %s\n", fsname);
+                RETURN(rc);
+        }
+        if (fsdb->fsdb_flags & FSDB_LOG_EMPTY) {
+                CERROR("%s is not defined\n", fsname);
+                mgs_free_fsdb(obd, fsdb);
+                RETURN(-EINVAL);
+        }
+
+        /* 10 covers the 4-character verb prefix, both '.' separators and
+         * the terminating NUL of the labels built below */
+        label_sz = 10 + strlen(fsname) + strlen(poolname);
+
+        /* the "add"/"rem" labels embed the OST name; without it the label
+         * buffer sized here would be too small for what sprintf() emits */
+        if ((cmd == LCFG_POOL_ADD || cmd == LCFG_POOL_REM) &&
+            ostname == NULL)
+                RETURN(-EINVAL);
+
+        /* check if ostname match fsname */
+        if (ostname != NULL) {
+                char *ptr;
+
+                ptr = strrchr(ostname, '-');
+                /* compare lengths too, otherwise a filesystem whose name
+                 * merely starts with the OST's prefix would be accepted */
+                if ((ptr == NULL) ||
+                    (strlen(fsname) != (size_t)(ptr - ostname)) ||
+                    (strncmp(fsname, ostname, ptr - ostname) != 0))
+                        RETURN(-EINVAL);
+                label_sz += strlen(ostname);
+        }
+
+        OBD_ALLOC(label, label_sz);
+        if (label == NULL)
+                RETURN(-ENOMEM);
+
+        switch(cmd) {
+        case LCFG_POOL_NEW: {
+                sprintf(label,
+                        "new %s.%s", fsname, poolname);
+                break;
+        }
+        case LCFG_POOL_ADD: {
+                sprintf(label,
+                        "add %s.%s.%s", fsname, poolname, ostname);
+                break;
+        }
+        case LCFG_POOL_REM: {
+                OBD_ALLOC(canceled_label, label_sz);
+                if (canceled_label == NULL)
+                        GOTO(out_free, rc = -ENOMEM);
+                sprintf(label,
+                        "rem %s.%s.%s", fsname, poolname, ostname);
+                sprintf(canceled_label,
+                        "add %s.%s.%s", fsname, poolname, ostname);
+                break;
+        }
+        case LCFG_POOL_DEL: {
+                OBD_ALLOC(canceled_label, label_sz);
+                if (canceled_label == NULL)
+                        GOTO(out_free, rc = -ENOMEM);
+                sprintf(label,
+                        "del %s.%s", fsname, poolname);
+                sprintf(canceled_label,
+                        "new %s.%s", fsname, poolname);
+                break;
+        }
+        default: {
+                /* do not write a record with an empty label */
+                GOTO(out_free, rc = -EINVAL);
+        }
+        }
+
+        /* mti is only needed when an earlier record has to be cancelled;
+         * allocate it up front so both mgs_modify() calls get a valid one */
+        if (canceled_label != NULL) {
+                OBD_ALLOC(mti, sizeof(*mti));
+                if (mti == NULL)
+                        GOTO(out_free, rc = -ENOMEM);
+                strcpy(mti->mti_svname, "lov pool");
+        }
+
+        sprintf(mdt_index, "-MDT%04x", 0);
+
+        down(&fsdb->fsdb_sem);
+
+        rc = name_create(&logname, fsname, mdt_index);
+        if (rc)
+                GOTO(out_unlock, rc);
+        rc = name_create(&lovname, logname, "-mdtlov");
+        if (rc)
+                GOTO(out_unlock, rc);
+
+        /* MDT log: cancel the superseded record (best effort), then append */
+        if (mti != NULL)
+                mgs_modify(obd, fsdb, mti, logname, lovname,
+                           canceled_label, CM_SKIP);
+        rc = mgs_write_log_pool(obd, logname, fsdb, lovname,
+                                cmd, fsname, poolname, ostname, label);
+        if (rc)
+                GOTO(out_unlock, rc);
+        name_destroy(&logname);
+        logname = NULL;
+
+        /* client log */
+        rc = name_create(&logname, fsname, "-client");
+        if (rc)
+                GOTO(out_unlock, rc);
+        /* The client log records are written for fsdb->fsdb_clilov (see the
+         * write just below), so look the cancelled record up under that name
+         * rather than the MDT lov name.
+         * NOTE(review): assumes mgs_modify() matches markers by target name
+         * - confirm against its implementation. */
+        if (mti != NULL)
+                mgs_modify(obd, fsdb, mti, logname, fsdb->fsdb_clilov,
+                           canceled_label, CM_SKIP);
+        rc = mgs_write_log_pool(obd, logname, fsdb, fsdb->fsdb_clilov,
+                                cmd, fsname, poolname, ostname, label);
+
+out_unlock:
+        if (logname != NULL)
+                name_destroy(&logname);
+        if (lovname != NULL)
+                name_destroy(&lovname);
+
+        up(&fsdb->fsdb_sem);
+
+out_free:
+        if (mti != NULL)
+                OBD_FREE(mti, sizeof(*mti));
+        if (canceled_label != NULL)
+                OBD_FREE(canceled_label, label_sz);
+        OBD_FREE(label, label_sz);
+
+        RETURN(rc);
+}
+
 #if 0
 /******************** unused *********************/
 static int mgs_backup_llog(struct obd_device *obd, char* fsname)
index 1f4d99b..ed5faa3 100644 (file)
@@ -117,9 +117,10 @@ int dump_obdo(struct obdo *oa)
 void dump_lsm(int level, struct lov_stripe_md *lsm)
 {
         CDEBUG(level, "lsm %p, objid "LPX64", maxbytes "LPX64", magic 0x%08X, "
-               "stripe_size %u, stripe_count %u\n", lsm,
+               "stripe_size %u, stripe_count %u pool "POOLNAMEF"\n", lsm,
                lsm->lsm_object_id, lsm->lsm_maxbytes, lsm->lsm_magic,
-               lsm->lsm_stripe_size, lsm->lsm_stripe_count);
+               lsm->lsm_stripe_size, lsm->lsm_stripe_count,
+               lsm->lsm_pool_name);
 }
 
 /* XXX assumes only a single page in request */
index a0e0912..79eb987 100644 (file)
@@ -111,28 +111,35 @@ static int lprocfs_obd_snprintf(char **page, int end, int *len,
         return n;
 }
 
-int lprocfs_add_simple(struct proc_dir_entry *root, char *name,
-                       read_proc_t *read_proc, write_proc_t *write_proc,
-                       void *data)
-{
-        struct proc_dir_entry *proc;
+cfs_proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
+                                         char *name,
+                                         read_proc_t *read_proc,
+                                         write_proc_t *write_proc,
+                                         void *data,
+                                         struct file_operations *fops)
+{
+        cfs_proc_dir_entry_t *proc;
         mode_t mode = 0;
 
         if (root == NULL || name == NULL)
-                return -EINVAL;
+                return ERR_PTR(-EINVAL);
         if (read_proc)
                 mode = 0444;
         if (write_proc)
                 mode |= 0200;
+        if (fops)
+                mode = 0644;
         proc = create_proc_entry(name, mode, root);
         if (!proc) {
                 CERROR("LprocFS: No memory to create /proc entry %s", name);
-                return -ENOMEM;
+                return ERR_PTR(-ENOMEM);
         }
         proc->read_proc = read_proc;
         proc->write_proc = write_proc;
         proc->data = data;
-        return 0;
+        if (fops)
+                proc->proc_fops = fops;
+        return proc;
 }
 
 struct proc_dir_entry *lprocfs_add_symlink(const char *name,
@@ -730,6 +737,8 @@ static const char *obd_connect_names[] = {
         "change_qunit_size",
         "alt_checksum_algorithm",
         "fid_is_enabled",
+        "version_recovery",
+        "pools",
         NULL
 };
 
@@ -1207,6 +1216,10 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
         LPROCFS_OBD_OP_INIT(num_private_stats,stats,unregister_page_removal_cb);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats, register_lock_cancel_cb);
         LPROCFS_OBD_OP_INIT(num_private_stats, stats,unregister_lock_cancel_cb);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add);
+        LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del);
 }
 
 int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats)
@@ -1488,6 +1501,7 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid)
         int rc = 0;
         struct nid_stat *tmp = NULL, *tmp1;
         struct obd_device *obd = NULL;
+        cfs_proc_dir_entry_t *entry;
         ENTRY;
 
         *newnid = 0;
@@ -1538,15 +1552,19 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid)
                 GOTO(destroy_new, rc = -ENOMEM);
         }
 
-        rc = lprocfs_add_simple(tmp->nid_proc, "uuid",
-                                lprocfs_exp_rd_uuid, NULL, tmp);
-        if (rc)
+        entry = lprocfs_add_simple(tmp->nid_proc, "uuid",
+                                   lprocfs_exp_rd_uuid, NULL, tmp, NULL);
+        if (IS_ERR(entry)) {
                 CWARN("Error adding the uuid file\n");
+                rc = PTR_ERR(entry);
+        }
 
-        rc = lprocfs_add_simple(tmp->nid_proc, "hash",
-                                lprocfs_exp_rd_hash, NULL, tmp);
-        if (rc)
+        entry = lprocfs_add_simple(tmp->nid_proc, "hash",
+                                lprocfs_exp_rd_hash, NULL, tmp, NULL);
+        if (IS_ERR(entry)) {
                 CWARN("Error adding the hash file\n");
+                rc = PTR_ERR(entry);
+        }
 
         exp->exp_nid_stats = tmp;
         *newnid = 1;
index fbc8a8d..d375ab7 100644 (file)
@@ -825,6 +825,28 @@ int class_process_config(struct lustre_cfg *lcfg)
                 err = class_del_conn(obd, lcfg);
                 GOTO(out, err = 0);
         }
+        case LCFG_POOL_NEW: {
+                err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2));
+                GOTO(out, err = 0);
+                break;
+        }
+        case LCFG_POOL_ADD: {
+                err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2),
+                                   lustre_cfg_string(lcfg, 3));
+                GOTO(out, err = 0);
+                break;
+        }
+        case LCFG_POOL_REM: {
+                err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2),
+                                   lustre_cfg_string(lcfg, 3));
+                GOTO(out, err = 0);
+                break;
+        }
+        case LCFG_POOL_DEL: {
+                err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2));
+                GOTO(out, err = 0);
+                break;
+        }
         default: {
                 err = obd_process_config(obd, sizeof(*lcfg), lcfg);
                 GOTO(out, err);
index 49f209e..3c7c88f 100644 (file)
@@ -2151,7 +2151,7 @@ static int filter_setup(struct obd_device *obd, struct lustre_cfg* lcfg)
         if (obd->obd_proc_exports_entry)
                 lprocfs_add_simple(obd->obd_proc_exports_entry, "clear",
                                    lprocfs_nid_stats_clear_read,
-                                   lprocfs_nid_stats_clear_write, obd);
+                                   lprocfs_nid_stats_clear_write, obd, NULL);
 
         memcpy((void *)addr, lustre_cfg_buf(lcfg, 4),
                LUSTRE_CFG_BUFLEN(lcfg, 4));
index b79cfd4..8e1e2d5 100644 (file)
@@ -3466,29 +3466,45 @@ static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs,
  */
 static int osc_getstripe(struct lov_stripe_md *lsm, struct lov_user_md *lump)
 {
-        struct lov_user_md lum, *lumk;
+        /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */
+        struct lov_user_md_v3 lum, *lumk;
+        struct lov_user_ost_data_v1 *lmm_objects;
         int rc = 0, lum_size;
         ENTRY;
 
         if (!lsm)
                 RETURN(-ENODATA);
 
-        if (copy_from_user(&lum, lump, sizeof(lum)))
+        /* we only need the header part from user space to get lmm_magic and
+         * lmm_stripe_count, (the header part is common to v1 and v3) */
+        lum_size = sizeof(struct lov_user_md_v1);
+        if (copy_from_user(&lum, lump, lum_size))
                 RETURN(-EFAULT);
 
-        if (lum.lmm_magic != LOV_USER_MAGIC)
+        if ((lum.lmm_magic != LOV_USER_MAGIC_V1) &&
+            (lum.lmm_magic != LOV_USER_MAGIC_V3))
                 RETURN(-EINVAL);
 
+        /* lov_user_md_vX and lov_mds_md_vX must have the same size */
+        LASSERT(sizeof(struct lov_user_md_v1) == sizeof(struct lov_mds_md_v1));
+        LASSERT(sizeof(struct lov_user_md_v3) == sizeof(struct lov_mds_md_v3));
+        LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lumk->lmm_objects[0]));
+
+        /* we can use lov_mds_md_size() to compute lum_size
+         * because lov_user_md_vX and lov_mds_md_vX have the same size */
         if (lum.lmm_stripe_count > 0) {
-                lum_size = sizeof(lum) + sizeof(lum.lmm_objects[0]);
+                lum_size = lov_mds_md_size(lum.lmm_stripe_count, lum.lmm_magic);
                 OBD_ALLOC(lumk, lum_size);
                 if (!lumk)
                         RETURN(-ENOMEM);
 
-                lumk->lmm_objects[0].l_object_id = lsm->lsm_object_id;
-                lumk->lmm_objects[0].l_object_gr = lsm->lsm_object_gr;
+                if (lum.lmm_magic == LOV_USER_MAGIC_V1)
+                        lmm_objects = &(((struct lov_user_md_v1 *)lumk)->lmm_objects[0]);
+                else
+                        lmm_objects = &(lumk->lmm_objects[0]);
+                lmm_objects->l_object_id = lsm->lsm_object_id;
         } else {
-                lum_size = sizeof(lum);
+                lum_size = lov_mds_md_size(0, lum.lmm_magic);
                 lumk = &lum;
         }
 
index 7df1357..97ea280 100644 (file)
@@ -1955,10 +1955,9 @@ static void print_lum (struct lov_user_md *lum)
         CDEBUG(D_OTHER, "\tlmm_stripe_offset: %#x\n", lum->lmm_stripe_offset);
 }
 
-void lustre_swab_lov_user_md(struct lov_user_md *lum)
+static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum)
 {
         ENTRY;
-        CDEBUG(D_IOCTL, "swabbing lov_user_md\n");
         __swab32s(&lum->lmm_magic);
         __swab32s(&lum->lmm_pattern);
         __swab64s(&lum->lmm_object_id);
@@ -1982,6 +1981,23 @@ static void print_lumj (struct lov_user_md_join *lumj)
         CDEBUG(D_OTHER, "\tlmm_extent_count: %#x\n", lumj->lmm_extent_count);
 }
 
+/* Byte-swap a v1 lov_user_md in place by handing it to
+ * lustre_swab_lov_user_md_common(); the call is logged at D_IOCTL. */
+void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum)
+{
+        ENTRY;
+        CDEBUG(D_IOCTL, "swabbing lov_user_md v1\n");
+        lustre_swab_lov_user_md_common(lum);
+        EXIT;
+}
+
+/* Byte-swap a v3 lov_user_md in place.  The structure is passed to
+ * lustre_swab_lov_user_md_common() through a cast to the v1 type; the call
+ * is logged at D_IOCTL. */
+void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum)
+{
+        ENTRY;
+        CDEBUG(D_IOCTL, "swabbing lov_user_md v3\n");
+        lustre_swab_lov_user_md_common((struct lov_user_md_v1 *)lum);
+        /* lmm_pool_name is char data: nothing to swab */
+        EXIT;
+}
+
 void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj)
 {
         ENTRY;
@@ -1997,63 +2013,20 @@ void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj)
         EXIT;
 }
 
-static void print_lum_objs(struct lov_user_md *lum)
+void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod,
+                                     int stripe_count)
 {
-        struct lov_user_ost_data *lod;
         int i;
         ENTRY;
-        if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */
-                return;
-        CDEBUG(D_OTHER, "lov_user_md_objects: %p\n", lum);
-        for (i = 0; i < lum->lmm_stripe_count; i++) {
-                lod = &lum->lmm_objects[i];
-                CDEBUG(D_OTHER, "(%i) lod->l_object_id: "LPX64"\n", i, lod->l_object_id);
-                CDEBUG(D_OTHER, "(%i) lod->l_object_gr: "LPX64"\n", i, lod->l_object_gr);
-                CDEBUG(D_OTHER, "(%i) lod->l_ost_gen: %#x\n", i, lod->l_ost_gen);
-                CDEBUG(D_OTHER, "(%i) lod->l_ost_idx: %#x\n", i, lod->l_ost_idx);
+        for (i = 0; i < stripe_count; i++) {
+                __swab64s(&(lod[i].l_object_id));
+                __swab64s(&(lod[i].l_object_gr));
+                __swab32s(&(lod[i].l_ost_gen));
+                __swab32s(&(lod[i].l_ost_idx));
         }
         EXIT;
 }
 
-void lustre_swab_lov_user_md_objects(struct lov_user_md *lum)
-{
-        struct lov_user_ost_data *lod;
-        int i;
-        ENTRY;
-        for (i = 0; i < lum->lmm_stripe_count; i++) {
-                lod = &lum->lmm_objects[i];
-                __swab64s(&lod->l_object_id);
-                __swab64s(&lod->l_object_gr);
-                __swab32s(&lod->l_ost_gen);
-                __swab32s(&lod->l_ost_idx);
-        }
-        print_lum_objs(lum);
-        EXIT;
-}
-
-
-void lustre_swab_lov_mds_md(struct lov_mds_md *lmm)
-{
-        struct lov_ost_data *lod;
-        int i;
-        ENTRY;
-        for (i = 0; i < lmm->lmm_stripe_count; i++) {
-                lod = &lmm->lmm_objects[i];
-                __swab64s(&lod->l_object_id);
-                __swab64s(&lod->l_object_gr);
-                __swab32s(&lod->l_ost_gen);
-                __swab32s(&lod->l_ost_idx);
-        }
-        __swab32s(&lmm->lmm_magic);
-        __swab32s(&lmm->lmm_pattern);
-        __swab64s(&lmm->lmm_object_id);
-        __swab64s(&lmm->lmm_object_gr);
-        __swab32s(&lmm->lmm_stripe_size);
-        __swab32s(&lmm->lmm_stripe_count);
-
-        EXIT;
-}
-
 
 void lustre_swab_ldlm_res_id (struct ldlm_res_id *id)
 {
index 5d7d35e..76e0727 100644 (file)
@@ -252,8 +252,8 @@ EXPORT_SYMBOL(lustre_swab_mds_rec_unlink);
 EXPORT_SYMBOL(lustre_swab_mds_rec_rename);
 EXPORT_SYMBOL(lustre_swab_mdt_rec_reint);
 EXPORT_SYMBOL(lustre_swab_lov_desc);
-EXPORT_SYMBOL(lustre_swab_lov_user_md);
-EXPORT_SYMBOL(lustre_swab_lov_mds_md);
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v1);
+EXPORT_SYMBOL(lustre_swab_lov_user_md_v3);
 EXPORT_SYMBOL(lustre_swab_lov_user_md_objects);
 EXPORT_SYMBOL(lustre_swab_lov_user_md_join);
 EXPORT_SYMBOL(lustre_swab_ldlm_res_id);
index 2c92091..8239298 100644 (file)
@@ -240,7 +240,7 @@ int main(int argc, char **argv)
                 return rc;
         }
 
-        lum_size = lov_mds_md_size(MAX_LOV_UUID_COUNT);
+        lum_size = lov_mds_md_size(MAX_LOV_UUID_COUNT, LOV_MAGIC);
         if ((lum_dir = (struct lov_user_md *)malloc(lum_size)) == NULL) {
                 rc = ENOMEM;
                 llapi_err(LLAPI_MSG_ERROR, "error: can't allocate %d bytes "
index 4b61102..ca5093c 100644 (file)
@@ -64,7 +64,7 @@ int main(int argc, char** argv)
                 return 1;
         }
 
-        lum_size = lov_mds_md_size(MAX_LOV_UUID_COUNT);
+        lum_size = lov_mds_md_size(MAX_LOV_UUID_COUNT, LOV_MAGIC);
 
         if ((lum_file = (struct lov_user_md *)malloc(lum_size)) == NULL) {
                 fprintf(stderr, "unable to allocate memory for ioctl's");
index b3024aa..82217bc 100755 (executable)
@@ -7,5 +7,6 @@ LUSTRE=${LUSTRE:-`dirname $0`/..}
 init_test_env $@
 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
 
+[ -n "$LOAD" ] && load_modules && exit 0
 [ -z "$NOFORMAT" ] && formatall
 [ -z "$NOSETUP" ] && setupall
index 7cd617f..8232094 100644 (file)
@@ -3540,6 +3540,7 @@ test_99a() {
 run_test 99a "cvs init ========================================="
 
 test_99b() {
+        [ -z "$(which cvs 2>/dev/null)" ] && skip "could not find cvs" && return
        [ ! -d $DIR/d99cvsroot ] && test_99a
        cd /etc/init.d
        # some versions of cvs import exit(1) when asked to import links or
@@ -3552,6 +3553,7 @@ test_99b() {
 run_test 99b "cvs import ======================================="
 
 test_99c() {
+        [ -z "$(which cvs 2>/dev/null)" ] && skip "could not find cvs" && return
        [ ! -d $DIR/d99cvsroot ] && test_99b
        cd $DIR
        mkdir -p $DIR/d99reposname
@@ -3561,6 +3563,7 @@ test_99c() {
 run_test 99c "cvs checkout ====================================="
 
 test_99d() {
+        [ -z "$(which cvs 2>/dev/null)" ] && skip "could not find cvs" && return
        [ ! -d $DIR/d99cvsroot ] && test_99c
        cd $DIR/d99reposname
        $RUNAS touch foo99
@@ -3569,6 +3572,7 @@ test_99d() {
 run_test 99d "cvs add =========================================="
 
 test_99e() {
+        [ -z "$(which cvs 2>/dev/null)" ] && skip "could not find cvs" && return
        [ ! -d $DIR/d99cvsroot ] && test_99c
        cd $DIR/d99reposname
        $RUNAS cvs update
@@ -3576,6 +3580,7 @@ test_99e() {
 run_test 99e "cvs update ======================================="
 
 test_99f() {
+        [ -z "$(which cvs 2>/dev/null)" ] && skip "could not find cvs" && return
        [ ! -d $DIR/d99cvsroot ] && test_99d
        cd $DIR/d99reposname
        $RUNAS cvs commit -m 'nomsg' foo99
@@ -4955,8 +4960,10 @@ test_121() { #bug #10589
 run_test 121 "read cancel race ========="
 
 test_123a() { # was test 123, statahead(bug 11401)
+        SLOWOK=0
         if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
                 log "testing on UP system. Performance may be not as good as expected."
+               SLOWOK=1
         fi
 
         remount_client $MOUNT
@@ -5011,7 +5018,7 @@ test_123a() { # was test 123, statahead(bug 11401)
         lctl get_param -n llite.*.statahead_stats
         # wait for commitment of removal
         sleep 2
-        [ $error -ne 0 ] && error "statahead is slow!"
+        [ $error -ne 0 -a $SLOWOK -eq 0 ] && error "statahead is slow!"
         return 0
 }
 run_test 123a "verify statahead work"
@@ -5496,6 +5503,123 @@ test_130e() {
 }
 run_test 130e "FIEMAP (test continuation FIEMAP calls)"
 
+# Settings shared by the OST pool tests (test_200 .. test_211).
+# TGTPOOL_LIST is the sequence TGTPOOL_FIRST, TGTPOOL_FIRST+TGTPOOL_STEP, ...
+# up to TGTPOOL_MAX (OSTCOUNT - 1); check_file_in_pool compares the OST
+# indices of a file against it.
+POOL=${POOL:-cea1}
+TGT_COUNT=$OSTCOUNT
+TGTPOOL_FIRST=1
+TGTPOOL_MAX=$(($TGT_COUNT - 1))
+TGTPOOL_STEP=2
+TGTPOOL_LIST=`seq $TGTPOOL_FIRST $TGTPOOL_STEP $TGTPOOL_MAX`
+# directories used by the directory-pool (202*) and file-pool (203) tests
+POOL_ROOT=${POOL_ROOT:-$DIR/d200.pools}
+POOL_DIR=$POOL_ROOT/dir_tst
+POOL_FILE=$POOL_ROOT/file_tst
+
+# Check that every OST index reported for file $1 is listed in TGTPOOL_LIST.
+# The indices are the second tab-separated field of the "0x" lines printed
+# by $GETSTRIPE.  On the first index outside the list the pool list and the
+# striping are printed, error is called and 1 is returned; otherwise 0.
+check_file_in_pool()
+{
+       # keep these out of the global scope: the tests calling this helper
+       # use their own "file" and "i" variables
+       local file=$1
+       local res found i
+
+       res=$($GETSTRIPE $file | grep 0x | cut -f2)
+       for i in $res
+       do
+               # TGTPOOL_LIST is left unquoted on purpose so that it is
+               # flattened to one line before being turned into ":a:b:c:"
+               found=$(echo :$TGTPOOL_LIST: | tr " " ":"  | grep :$i:)
+               if [[ "$found" == "" ]]
+               then
+                       echo "pool list: $TGTPOOL_LIST"
+                       echo "striping: $res"
+                       error "$file not allocated in $POOL"
+                       return 1
+               fi
+       done
+       return 0
+}
+
+# Create pool $FSNAME.$POOL with lctl on the mgs facet, then read the
+# lov.$FSNAME-MDT0000-mdtlov.pools.$POOL parameter; a failing get_param is
+# reported as a pool creation failure.
+test_200() {
+       do_facet mgs $LCTL pool_new $FSNAME.$POOL
+       do_facet mgs $LCTL get_param -n lov.$FSNAME-MDT0000-mdtlov.pools.$POOL
+       # $? is the status of the get_param call just above
+       [ $? == 0 ] || error "Pool creation of $POOL failed"
+}
+run_test 200 "Create new pool =========================================="
+
+# Add the OSTs TGTPOOL_FIRST..TGTPOOL_MAX (step TGTPOOL_STEP) to the pool
+# with a single indexed-list argument, then compare the sorted pool content
+# read back through get_param with the expected list of UUIDs.
+test_201() {
+       # expected content: one "<fsname>-OSTxxxx_UUID" per selected index,
+       # joined with spaces
+       TGT=$(seq -f $FSNAME-OST%04g_UUID $TGTPOOL_FIRST $TGTPOOL_STEP \
+               $TGTPOOL_MAX | tr '\n' ' ')
+       # the [first-last/step] list is quoted so that the local shell does
+       # not treat it as a pathname-expansion bracket pattern
+       do_facet mgs $LCTL pool_add $FSNAME.$POOL \
+               "$FSNAME-OST[$TGTPOOL_FIRST-$TGTPOOL_MAX/$TGTPOOL_STEP]_UUID"
+       res=$(do_facet mgs $LCTL get_param -n lov.$FSNAME-MDT0000-mdtlov.pools.$POOL | sort \
+                       | tr '\n' ' ')
+       [ "$res" = "$TGT" ] || error "Pool content ($res) do not match requested ($TGT)"
+}
+run_test 201 "Add targets to a pool ===================================="
+
+# Create $POOL_DIR and set a 2-stripe layout bound to $POOL on it.
+test_202a() {
+       mkdir -p $POOL_DIR
+       $SETSTRIPE -c 2 -p $POOL $POOL_DIR
+       # $? is the status of the setstripe call just above
+       [ $? = 0 ] || error "Cannot set pool $POOL to $POOL_DIR"
+}
+run_test 202a "Set pool on a directory ================================="
+
+# Read the pool name back from $POOL_DIR and compare it with $POOL.
+test_202b() {
+       # takes the 8th space-separated field of the "pool:" line
+       # NOTE(review): depends on the exact getstripe output layout - confirm
+       res=$($GETSTRIPE $POOL_DIR | grep pool: | cut -f8 -d " ")
+       [ "$res" = $POOL ] || error "Pool on $POOL_DIR is not $POOL"
+}
+run_test 202b "Check pool on a directory ==============================="
+
+# Create 3 * TGT_COUNT files in $POOL_DIR with touch and count how many of
+# them fail check_file_in_pool; any failure is an error.
+test_202c() {
+       failed=0
+       for i in $(seq -w 1 $(($TGT_COUNT * 3)))
+       do
+               file=$POOL_DIR/file-$i
+               touch $file
+               check_file_in_pool $file
+               # $? is the return value of check_file_in_pool
+               if [[ $? != 0 ]]
+               then
+                       failed=$(($failed + 1))
+               fi
+       done
+       [ "$failed" = 0 ] || error "$failed files not allocated in $POOL"
+}
+run_test 202c "Check files allocation from directory pool =============="
+
+# Create 3 * TGT_COUNT files under $POOL_FILE, each with setstripe -p $POOL,
+# and count how many of them fail check_file_in_pool; any failure is an error.
+test_203() {
+       mkdir -p $POOL_FILE
+       failed=0
+       for i in $(seq -w 1 $(($TGT_COUNT * 3)))
+       do
+               file=$POOL_FILE/spoo-$i
+               $SETSTRIPE -p $POOL $file
+               check_file_in_pool $file
+               # $? is the return value of check_file_in_pool
+               if [[ $? != 0 ]]
+               then
+                       failed=$(($failed + 1))
+               fi
+       done
+       [ "$failed" = 0 ] || error "$failed files not allocated in $POOL"
+}
+run_test 203 "Create files in a pool ==================================="
+
+# Remove the first target listed in the pool and check that it no longer
+# shows up in the pool content read back through get_param.
+test_210a() {
+       TGT=$(do_facet mgs $LCTL get_param -n lov.$FSNAME-MDT0000-mdtlov.pools.$POOL | head -1)
+       # with an empty pool there is nothing to remove, and the grep below
+       # would get no pattern and let the test pass without checking anything
+       [ -n "$TGT" ] || { error "Pool $FSNAME.$POOL has no target to remove"; return 1; }
+       do_facet mgs $LCTL pool_remove $FSNAME.$POOL $TGT
+       res=$(do_facet mgs $LCTL get_param -n lov.$FSNAME-MDT0000-mdtlov.pools.$POOL | grep "$TGT")
+       [ "$res" = "" ] || error "$TGT not removed from $FSNAME.$POOL"
+}
+run_test 210a "Remove a target from a pool ============================="
+
+# Remove every target currently listed in the pool, one pool_remove call per
+# target, then check that the pool content read back is empty.
+test_210b() {
+       for TGT in $(do_facet mgs $LCTL get_param -n lov.$FSNAME-MDT0000-mdtlov.pools.$POOL)
+       do
+               do_facet mgs $LCTL pool_remove $FSNAME.$POOL $TGT
+       done
+       res=$(do_facet mgs $LCTL get_param -n lov.$FSNAME-MDT0000-mdtlov.pools.$POOL)
+       [ "$res" = "" ] || error "Pool $FSNAME.$POOL cannot be drained"
+}
+run_test 210b "Remove all targets from a pool =========================="
+
+# Destroy the pool and check that reading its parameter yields no output;
+# get_param's stderr is discarded, so only its stdout is tested.
+test_211() {
+       do_facet mgs $LCTL pool_destroy $FSNAME.$POOL
+       res=$(do_facet mgs "$LCTL get_param -n lov.$FSNAME-MDT0000-mdtlov.pools.$POOL 2>/dev/null")
+       [ "$res" = "" ] || error "Pool $FSNAME.$POOL is not destroyed"
+}
+run_test 211 "Remove a pool ============================================"
+
 TMPDIR=$OLDTMPDIR
 TMP=$OLDTMP
 HOME=$OLDHOME
index b692fe5..229dbf6 100644 (file)
@@ -83,7 +83,7 @@ init_test_env() {
     export LUSTRE=`absolute_path $LUSTRE`
     export TESTSUITE=`basename $0 .sh`
 
-    [ -d /r ] && export ROOT=${ROOT:-/r}
+    #[ -d /r ] && export ROOT=${ROOT:-/r}
     export TMP=${TMP:-$ROOT/tmp}
     export TESTSUITELOG=${TMP}/${TESTSUITE}.log
     export HOSTNAME=${HOSTNAME:-`hostname`}
@@ -506,7 +506,7 @@ zconf_mount() {
     do_node $client "lctl set_param debug=$PTLDEBUG;
         lctl set_param subsystem_debug=${SUBSYSTEM# };
         lctl set_param debug_mb=${DEBUG_SIZE}"
-    [ -d /r ] && $LCTL modules > /r/tmp/ogdb-$HOSTNAME
+    
     return 0
 }
 
index 19b7d41..6b2c150 100644 (file)
@@ -33,7 +33,7 @@ endif # UTILS
 lib_LIBRARIES = liblustreapi.a libiam.a
 
 lctl_SOURCES = obd.c lustre_cfg.c lctl.c obdctl.h
-lctl_LDADD := $(LIBREADLINE) $(LIBPTLCTL)
+lctl_LDADD := $(LIBREADLINE) liblustreapi.a $(LIBPTLCTL)
 lctl_DEPENDENCIES := $(LIBPTLCTL)
 
 lfs_SOURCES = lfs.c obd.c lustre_cfg.c
@@ -41,7 +41,7 @@ lfs_LDADD := $(LIBREADLINE) liblustreapi.a $(LIBPTLCTL)
 lfs_DEPENDENCIES := $(LIBPTLCTL) liblustreapi.a 
 
 loadgen_SOURCES = loadgen.c lustre_cfg.c obd.c
-loadgen_LDADD := $(LIBREADLINE) $(LIBPTLCTL) $(PTHREAD_LIBS)
+loadgen_LDADD := $(LIBREADLINE) liblustreapi.a $(LIBPTLCTL) $(PTHREAD_LIBS)
 loadgen_DEPENDENCIES := $(LIBPTLCTL)
 
 if EXT2FS_DEVEL
index a1b58f6..264ffcc 100644 (file)
@@ -208,6 +208,24 @@ command_t cmdlist[] = {
          "get the device info of a attached file\n"
          "usage: blockdev_info <device_name>"},
 
+        /* Pool commands */
+        {"===  Pools ==", jt_noop, 0, "pool management"},
+        {"pool_new", jt_pool_cmd, 0,
+         "add a new pool\n"
+         "usage pool_new <fsname>.<poolname>"},
+        {"pool_add", jt_pool_cmd, 0,
+         "add the named OSTs to the pool\n"
+         "usage pool_add <fsname>.<poolname> <ostname indexed list>"},
+        {"pool_remove", jt_pool_cmd, 0,
+         "remove the named OST from the pool\n"
+         "usage pool_remove <fsname>.<poolname> <ostname indexed list>"},
+        {"pool_destroy", jt_pool_cmd, 0,
+         "destroy a pool\n"
+         "usage pool_destroy <fsname>.<poolname>"},
+        {"pool_list", jt_pool_cmd, 0,
+         "list pools and pools members\n"
+         "usage pool_list  <fsname>[.<poolname>] | <pathname>"},
+
         /* Test only commands */
         {"==== testing (DANGEROUS) ====", jt_noop, 0, "testing (DANGEROUS)"},
         {"--threads", jt_opt_threads, 0,
index 8d4246f..2479e93 100644 (file)
  * Author: Robert Read <rread@clusterfs.com>
  */
 
+/* for O_DIRECTORY */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include <stdlib.h>
 #include <stdio.h>
 #include <getopt.h>
@@ -94,6 +99,7 @@ static int lfs_rsetfacl(int argc, char **argv);
 static int lfs_rgetfacl(int argc, char **argv);
 static int lfs_cp(int argc, char **argv);
 static int lfs_ls(int argc, char **argv);
+static int lfs_poollist(int argc, char **argv);
 
 /* all avaialable commands */
 command_t cmdlist[] = {
@@ -101,30 +107,34 @@ command_t cmdlist[] = {
          "Create a new file with a specific striping pattern or\n"
          "set the default striping pattern on an existing directory or\n"
          "delete the default striping pattern from an existing directory\n"
-         "usage: setstripe <filename|dirname> <stripe_size> <stripe_index> <stripe_count>\n"
-         "       or \n"
-         "       setstripe <filename|dirname> [--size|-s stripe_size]\n"
-         "                                    [--index|-i stripe_index]\n"
-         "                                    [--count|-c stripe_count]\n"
+         "usage: setstripe [--size|-s stripe_size] [--offset|-o start_ost]\n"
+         "                 [--count|-c stripe_count] [--pool|-p pool_name]\n"
+         "                 <dir|filename>\n"
          "       or \n"
-         "       setstripe -d <dirname>   (to delete default striping)\n"
+         "       setstripe -d <dir>   (to delete default striping)\n"
          "\tstripe_size:  Number of bytes on each OST (0 filesystem default)\n"
-         "\t              Can be specified with k, m or g (in KB, MB and GB respectively)\n"
-         "\tstripe_index: OST index of first stripe (-1 filesystem default)\n"
-         "\tstripe_count: Number of OSTs to stripe over (0 default, -1 all)"},
+         "\t              Can be specified with k, m or g (in KB, MB and GB\n"
+         "\t              respectively)\n"
+         "\tstart_ost:    OST index of first stripe (-1 filesystem default)\n"
+         "\tstripe_count: Number of OSTs to stripe over (0 default, -1 all)\n"
+         "\tpool_name:    Name of OST pool"},
         {"getstripe", lfs_getstripe, 0,
-         "To list the striping info for a given filename or files in a\n"
+         "To list the striping info for a given file or files in a\n"
          "directory or recursively for all files in a directory tree.\n"
          "usage: getstripe [--obd|-O <uuid>] [--quiet | -q] [--verbose | -v]\n"
          "                 [--recursive | -r] <dir|file> ..."},
+        {"poollist", lfs_poollist, 0,
+         "List pools or pool OSTs\n"
+         "usage: poollist <fsname>[.<poolname>] | <pathname>\n"},
         {"find", lfs_find, 0,
          "To find files that match given parameters recursively in a directory tree.\n"
-         "usage: find <dir/file> ... \n"
+         "usage: find <dir|file> ... \n"
          "     [[!] --atime|-A [+-]N] [[!] --mtime|-M [+-]N] [[!] --ctime|-C [+-]N]\n"
          "     [--maxdepth|-D N] [[!] --name|-n <pattern>] [--print0|-P]\n"
          "     [--print|-p] [--obd|-O <uuid[s]>] [[!] --size|-s [+-]N[bkMGTP]]\n"
          "     [[!] --type|-t <filetype>] [[!] --gid|-g N] [[!] --group|-G <name>]\n"
          "     [[!] --uid|-u N] [[!] --user|-U <name>]\n"
+         "     [[!] --pool <name>]\n"
          "\t !: used before an option indicates 'NOT' the requested attribute\n"
          "\t -: used before an value indicates 'AT MOST' the requested value\n"
          "\t +: used before an option indicates 'AT LEAST' the requested value\n"},
@@ -203,12 +213,15 @@ static int lfs_setstripe(int argc, char **argv)
         char *stripe_size_arg = NULL;
         char *stripe_off_arg = NULL;
         char *stripe_count_arg = NULL;
+        char *pool_name_arg = NULL;
         unsigned long long size_units;
 
         struct option long_opts[] = {
                 {"size",        required_argument, 0, 's'},
                 {"count",       required_argument, 0, 'c'},
                 {"index",       required_argument, 0, 'i'},
+                {"offset",      required_argument, 0, 'o'},
+                {"pool",        required_argument, 0, 'p'},
                 {"delete",      no_argument,       0, 'd'},
                 {0, 0, 0, 0}
         };
@@ -221,7 +234,7 @@ static int lfs_setstripe(int argc, char **argv)
                  * usage */
                 fname = argv[2];
                 optind = 2;
-        } else if (argc == 5  && 
+        } else if (argc == 5  &&
                    (argv[2][0] != '-' || isdigit(argv[2][1])) &&
                    (argv[3][0] != '-' || isdigit(argv[3][1])) &&
                    (argv[4][0] != '-' || isdigit(argv[4][1])) ) {
@@ -234,7 +247,7 @@ static int lfs_setstripe(int argc, char **argv)
                 optind = 4;
         } else {
                 optind = 0;
-                while ((c = getopt_long(argc, argv, "c:di:s:",
+                while ((c = getopt_long(argc, argv, "c:di:o:s:p:",
                                                 long_opts, NULL)) >= 0) {
                         switch (c) {
                         case 0:
@@ -248,11 +261,15 @@ static int lfs_setstripe(int argc, char **argv)
                                 delete = 1;
                                 break;
                         case 'i':
+                        case 'o':
                                 stripe_off_arg = optarg;
                                 break;
                         case 's':
                                 stripe_size_arg = optarg;
                                 break;
+                        case 'p':
+                                pool_name_arg = optarg;
+                                break;
                         case '?':
                                 return CMD_HELP;
                         default:
@@ -268,11 +285,11 @@ static int lfs_setstripe(int argc, char **argv)
                         return CMD_HELP;
 
 
-                if (delete && 
-                    (stripe_size_arg != NULL || stripe_off_arg != NULL || 
-                     stripe_count_arg != NULL)) {
+                if (delete &&
+                    (stripe_size_arg != NULL || stripe_off_arg != NULL ||
+                     stripe_count_arg != NULL || pool_name_arg != NULL)) {
                         fprintf(stderr, "error: %s: cannot specify -d with "
-                                        "-s, -c or -i options\n",
+                                        "-s, -c -o or -p options\n",
                                         argv[0]);
                         return CMD_HELP;
                 }
@@ -312,7 +329,12 @@ static int lfs_setstripe(int argc, char **argv)
                 }
         }
 
-        result = llapi_file_create(fname, st_size, st_offset, st_count, 0);
+        if (pool_name_arg == NULL)
+                result = llapi_file_create(fname, st_size, st_offset, st_count, 0);
+        else
+                result = llapi_file_create_pool(fname, st_size, st_offset,
+                                                st_count, 0, pool_name_arg);
+
         if (result)
                 fprintf(stderr, "error: %s: create stripe file failed\n",
                                 argv[0]);
@@ -320,11 +342,19 @@ static int lfs_setstripe(int argc, char **argv)
         return result;
 }
 
+/* "poollist" command handler: requires exactly one operand (argv[1]), which
+ * is passed to llapi_poollist(); any other argument count yields CMD_HELP. */
+static int lfs_poollist(int argc, char **argv)
+{
+        return argc == 2 ? llapi_poollist(argv[1]) : CMD_HELP;
+}
+
 static int set_time(time_t *time, time_t *set, char *str)
 {
         time_t t;
         int res = 0;
-        
+
         if (str[0] == '+')
                 res = 1;
         else if (str[0] == '-')
@@ -399,6 +429,7 @@ static int id2name(char **name, unsigned int id, int type)
         return 0;
 }
 
+#define FIND_POOL_OPT 3
 static int lfs_find(int argc, char **argv)
 {
         int new_fashion = 1;
@@ -417,6 +448,8 @@ static int lfs_find(int argc, char **argv)
                 {"uid",       required_argument, 0, 'u'},
                 {"user",      required_argument, 0, 'U'},
                 {"name",      required_argument, 0, 'n'},
+                /* no short option for pool, p/P already used */
+                {"pool",      required_argument, 0, FIND_POOL_OPT},
                 /* --obd is considered as a new option. */
                 {"obd",       required_argument, 0, 'O'},
                 {"ost",       required_argument, 0, 'O'},
@@ -522,8 +555,8 @@ static int lfs_find(int argc, char **argv)
                         new_fashion = 1;
                         param.gid = strtol(optarg, &endptr, 10);
                         if (optarg == endptr) {
-                               ret = name2id(&param.gid, optarg, GRPQUOTA);
-                               if (ret != 0) {
+                                ret = name2id(&param.gid, optarg, GRPQUOTA);
+                                if (ret != 0) {
                                         fprintf(stderr, "Group/GID: %s cannot "
                                                 "be found.\n", optarg);
                                         return -1;
@@ -546,8 +579,8 @@ static int lfs_find(int argc, char **argv)
                         new_fashion = 1;
                         param.uid = strtol(optarg, &endptr, 10);
                         if (optarg == endptr) {
-                               ret = name2id(&param.uid, optarg, USRQUOTA);
-                               if (ret != 0) {
+                                ret = name2id(&param.uid, optarg, USRQUOTA);
+                                if (ret != 0) {
                                         fprintf(stderr, "User/UID: %s cannot "
                                                 "be found.\n", optarg);
                                         return -1;
@@ -556,6 +589,22 @@ static int lfs_find(int argc, char **argv)
                         param.exclude_uid = !!neg_opt;
                         param.check_uid = 1;
                         break;
+                case FIND_POOL_OPT:
+                        new_fashion = 1;
+                        if (strlen(optarg) > MAXPOOLNAME) {
+                                fprintf(stderr,
+                                        "Pool name %s is too long"
+                                        " (max is %d)\n", optarg,
+                                        MAXPOOLNAME);
+                                return -1;
+                        }
+                        /* we do not check for an empty pool name because an
+                         * empty pool is used to find V1 lov attributes */
+                        strncpy(param.poolname, optarg, MAXPOOLNAME);
+                        param.poolname[MAXPOOLNAME] = '\0';
+                        param.exclude_pool = !!neg_opt;
+                        param.check_pool = 1;
+                        break;
                 case 'n':
                         new_fashion = 1;
                         param.pattern = (char *)optarg;
@@ -667,7 +716,7 @@ static int lfs_find(int argc, char **argv)
                         return CMD_HELP;
                 };
         }
-        
+
         if (pathstart == -1) {
                 fprintf(stderr, "error: %s: no filename|pathname\n",
                         argv[0]);
@@ -689,7 +738,7 @@ static int lfs_find(int argc, char **argv)
                 if (!param.recursive && param.maxdepth == -1)
                         param.maxdepth = 1;
         }
-        
+
         do {
                 if (new_fashion)
                         ret = llapi_find(argv[pathstart], &param);
@@ -763,7 +812,7 @@ static int lfs_getstripe(int argc, char **argv)
         } while (++optind < argc && !rc);
 
         if (rc)
-                fprintf(stderr, "error: %s failed for %s.\n", 
+                fprintf(stderr, "error: %s failed for %s.\n",
                         argv[0], argv[optind - 1]);
         return rc;
 }
@@ -1440,7 +1489,7 @@ do {                                                                    \
  *        2. specifiers may be encountered multiple times (2s3s is 5 seconds)
  *        3. empty integer value is interpreted as 0
  */
+
 static unsigned long str2sec(const char* timestr) {
         const char spec[] = "smhdw";
         const unsigned long mult[] = {1, 60, 60*60, 24*60*60, 7*24*60*60};
@@ -1462,7 +1511,7 @@ static unsigned long str2sec(const char* timestr) {
 
                 v = strtoul(timestr, &tail, 10);
                 if (v == ULONG_MAX || *tail == '\0')
-                        /* value too large (ULONG_MAX or more) 
+                        /* value too large (ULONG_MAX or more)
                            or missing specifier */
                         goto error;
 
index 1adf2ec..f824151 100644 (file)
@@ -59,6 +59,7 @@
 #include <sys/types.h>
 #include <sys/syscall.h>
 #include <fnmatch.h>
+#include <glob.h>
 #ifdef HAVE_LINUX_UNISTD_H
 #include <linux/unistd.h>
 #else
@@ -209,61 +210,69 @@ int parse_size(char *optarg, unsigned long long *size,
         return 0;
 }
 
-int llapi_file_open(const char *name, int flags, int mode,
-                    unsigned long stripe_size, int stripe_offset,
-                    int stripe_count, int stripe_pattern)
+int llapi_stripe_limit_check(unsigned long stripe_size, int stripe_offset,
+                             int stripe_count, int stripe_pattern)
 {
-        struct lov_user_md lum = { 0 };
-        int fd, rc = 0;
-        int isdir = 0;
         int page_size;
 
-        fd = open(name, flags | O_LOV_DELAY_CREATE, mode);
-        if (fd < 0 && errno == EISDIR) {
-                fd = open(name, O_DIRECTORY | O_RDONLY);
-                isdir++;
-        }
-
-        if (fd < 0) {
-                rc = -errno;
-                llapi_err(LLAPI_MSG_ERROR, "unable to open '%s'", name);
-                return rc;
-        }
-
         /* 64 KB is the largest common page size I'm aware of (on ia64), but
          * check the local page size just in case. */
         page_size = LOV_MIN_STRIPE_SIZE;
         if (getpagesize() > page_size) {
                 page_size = getpagesize();
-                llapi_err_noerrno(LLAPI_MSG_WARN, 
+                llapi_err_noerrno(LLAPI_MSG_WARN,
                                   "warning: your page size (%u) is "
-                                  "larger than expected (%u)", page_size, 
+                                  "larger than expected (%u)", page_size,
                                   LOV_MIN_STRIPE_SIZE);
         }
         if (stripe_size < 0 || (stripe_size & (LOV_MIN_STRIPE_SIZE - 1))) {
-                errno = rc = -EINVAL;
                 llapi_err(LLAPI_MSG_ERROR, "error: bad stripe_size %lu, "
-                          "must be an even multiple of %d bytes", 
+                          "must be an even multiple of %d bytes",
                           stripe_size, page_size);
-                goto out;
+                return -EINVAL;
         }
         if (stripe_offset < -1 || stripe_offset > MAX_OBD_DEVICES) {
-                errno = rc = -EINVAL;
-                llapi_err(LLAPI_MSG_ERROR, "error: bad stripe offset %d", 
+                llapi_err(LLAPI_MSG_ERROR, "error: bad stripe offset %d",
                           stripe_offset);
-                goto out;
+                return -EINVAL;
         }
         if (stripe_count < -1 || stripe_count > LOV_MAX_STRIPE_COUNT) {
-                errno = rc = -EINVAL;
-                llapi_err(LLAPI_MSG_ERROR, "error: bad stripe count %d", 
+                llapi_err(LLAPI_MSG_ERROR, "error: bad stripe count %d",
                           stripe_count);
-                goto out;
+                return -EINVAL;
         }
         if (stripe_count > 0 && (__u64)stripe_size * stripe_count > 0xffffffff){
-                errno = rc = -EINVAL;
                 llapi_err(LLAPI_MSG_ERROR, "error: stripe_size %lu * "
-                          "stripe_count %u exceeds 4GB", stripe_size, 
+                          "stripe_count %u exceeds 4GB", stripe_size,
                           stripe_count);
+                return -EINVAL;
+        }
+        return 0;
+}
+
+int llapi_file_open(const char *name, int flags, int mode,
+                    unsigned long stripe_size, int stripe_offset,
+                    int stripe_count, int stripe_pattern)
+{
+        struct lov_user_md lum = { 0 };
+        int fd, rc = 0;
+        int isdir = 0;
+
+        fd = open(name, flags | O_LOV_DELAY_CREATE, mode);
+        if (fd < 0 && errno == EISDIR) {
+                fd = open(name, O_DIRECTORY | O_RDONLY);
+                isdir++;
+        }
+
+        if (fd < 0) {
+                rc = -errno;
+                llapi_err(LLAPI_MSG_ERROR, "unable to open '%s'", name);
+                return rc;
+        }
+
+        if ((rc = llapi_stripe_limit_check(stripe_size, stripe_offset,
+                                           stripe_count, stripe_pattern)) != 0) {
+                errno = rc;
                 goto out;
         }
 
@@ -293,6 +302,74 @@ out:
         return fd;
 }
 
+static int poolpath(char *fsname, char *pathname, char *pool_pathname);
+
+/*
+ * Open 'name' with O_LOV_DELAY_CREATE added to 'flags' and apply a
+ * LOV_USER_MAGIC_V3 striping descriptor carrying an OST pool name through
+ * the LL_IOC_LOV_SETSTRIPE ioctl.  If the open fails with EISDIR, the path
+ * is reopened as a directory and the ioctl is issued on that descriptor.
+ *
+ * pool_name may be <poolname> or <fsname>.<poolname>; the part before the
+ * first '.' is dropped only when poolpath() recognises it as a filesystem.
+ *
+ * Returns the open descriptor on success.  On failure the descriptor is
+ * closed and a negative errno value is returned.
+ */
+int llapi_file_open_pool(const char *name, int flags, int mode,
+                         unsigned long stripe_size, int stripe_offset,
+                         int stripe_count, int stripe_pattern, char *pool_name)
+{
+        struct lov_user_md_v3 lum = { 0 };
+        char fsname[MAX_OBD_NAME + 1], *ptr;
+        int fd, rc;
+
+        fd = open(name, flags | O_LOV_DELAY_CREATE, mode);
+        if (fd < 0 && errno == EISDIR)
+                fd = open(name, O_DIRECTORY | O_RDONLY);
+
+        if (fd < 0) {
+                rc = -errno;
+                llapi_err(LLAPI_MSG_ERROR, "unable to open '%s'", name);
+                return rc;
+        }
+
+        rc = llapi_stripe_limit_check(stripe_size, stripe_offset,
+                                      stripe_count, stripe_pattern);
+        if (rc != 0) {
+                /* rc is a negative errno value; errno itself is positive */
+                errno = -rc;
+                goto out;
+        }
+
+        /* in case user give the full pool name <fsname>.<poolname>, skip
+         * the fsname */
+        ptr = strchr(pool_name, '.');
+        /* a prefix that cannot fit in fsname[] cannot be tested as a
+         * filesystem name, so the pool name is then kept as given */
+        if (ptr != NULL && (size_t)(ptr - pool_name) < sizeof(fsname)) {
+                size_t len = ptr - pool_name;
+
+                memcpy(fsname, pool_name, len);
+                fsname[len] = '\0';
+                /* if fsname matches a fs skip it
+                 * if not keep the poolname as is */
+                if (poolpath(fsname, NULL, NULL) == 0)
+                        pool_name = ptr + 1;
+        }
+
+        /*  Initialize IOCTL striping pattern structure */
+        lum.lmm_magic = LOV_USER_MAGIC_V3;
+        lum.lmm_pattern = stripe_pattern;
+        lum.lmm_stripe_size = stripe_size;
+        lum.lmm_stripe_count = stripe_count;
+        lum.lmm_stripe_offset = stripe_offset;
+        strncpy(lum.lmm_pool_name, pool_name, MAXPOOLNAME);
+
+        if (ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum)) {
+                char *errmsg = "stripe already set";
+                rc = -errno;
+                if (errno != EEXIST && errno != EALREADY)
+                        errmsg = strerror(errno);
+
+                llapi_err_noerrno(LLAPI_MSG_ERROR,
+                                  "error on ioctl "LPX64" for '%s' (%d): %s",
+                                  (__u64)LL_IOC_LOV_SETSTRIPE, name, fd, errmsg);
+        }
+out:
+        if (rc) {
+                close(fd);
+                fd = rc;
+        }
+
+        return fd;
+}
+
 int llapi_file_create(const char *name, unsigned long stripe_size,
                       int stripe_offset, int stripe_count, int stripe_pattern)
 {
@@ -307,6 +384,202 @@ int llapi_file_create(const char *name, unsigned long stripe_size,
         return 0;
 }
 
+/*
+ * Create 'name' by calling llapi_file_open_pool() with O_CREAT | O_WRONLY
+ * and mode 0644, using the given striping parameters and pool, then close
+ * the descriptor again.  Returns 0 on success, or the negative value
+ * reported by llapi_file_open_pool().
+ */
+int llapi_file_create_pool(const char *name, unsigned long stripe_size,
+                           int stripe_offset, int stripe_count,
+                           int stripe_pattern, char *pool_name)
+{
+        int rc = llapi_file_open_pool(name, O_CREAT | O_WRONLY, 0644,
+                                      stripe_size, stripe_offset, stripe_count,
+                                      stripe_pattern, pool_name);
+
+        if (rc >= 0) {
+                close(rc);
+                rc = 0;
+        }
+        return rc;
+}
+
+
+/*
+ * Print the header "Pool: <fs>.<pool_file>" followed by the contents of the
+ * file <pool_dir>/<pool_file>, one line at a time.
+ *
+ * Returns 0 on success, -ENAMETOOLONG if the joined path does not fit in the
+ * local buffer, or -EINVAL if the file cannot be opened.
+ */
+static int print_pool_members(char *fs, char *pool_dir, char *pool_file)
+{
+        char path[PATH_MAX + 1];
+        char buf[1024];
+        FILE *fp;
+        int len;
+
+        llapi_printf(LLAPI_MSG_NORMAL, "Pool: %s.%s\n", fs, pool_file);
+        len = snprintf(path, sizeof(path), "%s/%s", pool_dir, pool_file);
+        if (len < 0 || len >= (int)sizeof(path))
+                return -ENAMETOOLONG;
+
+        if ((fp = fopen(path, "r")) == NULL) {
+                llapi_err(LLAPI_MSG_ERROR, "Cannot open %s\n", path);
+                return -EINVAL;
+        }
+        /* file contents are data, never a format string */
+        while (fgets(buf, sizeof(buf), fp) != NULL)
+                llapi_printf(LLAPI_MSG_NORMAL, "%s", buf);
+
+        fclose(fp);
+        return 0;
+}
+
+/*
+ * search lustre fsname from pathname
+ *
+ * Walks the mount table (MOUNTED) and, for the first Lustre entry whose
+ * mount directory is a string prefix of pathname, copies the part of
+ * mnt_fsname that follows its first '/' into fsname.
+ *
+ * Returns 0 on success, -EIO if the mount table cannot be opened, -EINVAL
+ * if the matching entry has no '/' in mnt_fsname, -ENOENT if no entry
+ * matches.
+ *
+ * NOTE(review): the match is a plain prefix comparison, so a mount at
+ * /mnt/a also matches /mnt/ab; the copy into fsname is unbounded, so the
+ * caller must supply a large enough buffer.
+ */
+static int search_fsname(char *pathname, char *fsname)
+{
+        struct mntent *mnt;
+        FILE *fp;
+        char *ptr;
+        int rc = -ENOENT;
+
+        /* get the mount point */
+        fp = setmntent(MOUNTED, "r");
+        if (fp == NULL) {
+                 llapi_err(LLAPI_MSG_ERROR,
+                           "setmntent(%s) failed: %s:", MOUNTED,
+                           strerror (errno));
+                 return -EIO;
+        }
+        while ((mnt = getmntent(fp)) != NULL) {
+                if (!llapi_is_lustre_mnt(mnt))
+                        continue;
+                /* search by pathname */
+                if (strncmp(mnt->mnt_dir, pathname,
+                            strlen(mnt->mnt_dir)) != 0)
+                        continue;
+                ptr = strchr(mnt->mnt_fsname, '/');
+                if (ptr == NULL) {
+                        rc = -EINVAL;
+                } else {
+                        strcpy(fsname, ptr + 1);
+                        rc = 0;
+                }
+                break;
+        }
+        /* release the mount table stream on every path, match or not */
+        endmntent(fp);
+        return rc;
+}
+
+/*
+ * find the pool directory path under /proc
+ * (can be also used to test if a fsname is known)
+ *
+ * If fsname is NULL it is looked up from pathname with search_fsname(), and
+ * pathname is then overwritten with the filesystem name that was found.
+ * The first match of the glob pattern built below is copied to
+ * pool_pathname unless that pointer is NULL.
+ *
+ * Returns 0 on success, the error from search_fsname(), or -ENOENT when
+ * glob() fails or matches nothing.
+ */
+static int poolpath(char *fsname, char *pathname, char *pool_pathname)
+{
+        int rc = 0;
+        glob_t glob_info;
+        char pattern[PATH_MAX + 1];
+        char buffer[PATH_MAX];
+
+        if (fsname == NULL) {
+                rc = search_fsname(pathname, buffer);
+                if (rc != 0)
+                        return rc;
+                fsname = buffer;
+                strcpy(pathname, fsname);
+        }
+
+        snprintf(pattern, PATH_MAX,
+                 "/proc/fs/lustre/lov/%s-*/pools",
+                 fsname);
+        rc = glob(pattern, GLOB_BRACE, NULL, &glob_info);
+        if (rc)
+                return -ENOENT;
+
+        if (glob_info.gl_pathc == 0) {
+                rc = -ENOENT;
+        } else if (pool_pathname != NULL) {
+                /* in fsname test mode, pool_pathname is NULL */
+                strcpy(pool_pathname, glob_info.gl_pathv[0]);
+        }
+
+        /* glob() succeeded, so its result is released on every path */
+        globfree(&glob_info);
+        return rc;
+}
+
+/*
+ * List the pools of a filesystem, or the members of one pool.
+ *
+ * name is either a pathname (any name containing '/'; only absolute paths
+ * are accepted, others return -EINVAL) or FSNAME[.POOLNAME].  When a pool
+ * name is given, print_pool_members() prints that pool; otherwise every
+ * entry of the pools directory except "." and ".." is printed as
+ * <fsname>.<entry>.
+ *
+ * name is split in place at the first '.' while the function runs and is
+ * restored before returning.  Returns 0 on success or a negative errno
+ * value.
+ */
+int llapi_poollist(char *name)
+{
+        char rname[PATH_MAX + 1], pathname[PATH_MAX + 1];
+        char *poolname = NULL;
+        char *fsname;
+        int rc;
+
+        /* is name a pathname ? */
+        if (strchr(name, '/') != NULL) {
+                /* only absolute pathname is supported */
+                if (*name != '/')
+                        return -EINVAL;
+                if (!realpath(name, rname)) {
+                        rc = -errno;
+                        llapi_err(LLAPI_MSG_ERROR,
+                                  "llapi_poollist: invalid path '%s'",
+                                  name);
+                        return rc;
+                }
+
+                rc = poolpath(NULL, rname, pathname);
+                if (rc != 0) {
+                        errno = -rc;
+                        llapi_err(LLAPI_MSG_ERROR,
+                                  "llapi_poollist: '%s' is not"
+                                  " a Lustre filesystem",
+                                  name);
+                        return rc;
+                }
+                /* poolpath() replaced rname with the filesystem name */
+                fsname = rname;
+        } else {
+                /* name is FSNAME[.POOLNAME] */
+                fsname = name;
+                poolname = strchr(name, '.');
+                if (poolname != NULL)
+                        *poolname++ = '\0';
+                rc = poolpath(fsname, NULL, pathname);
+                if (rc != 0) {
+                        errno = -rc;
+                        llapi_err(LLAPI_MSG_ERROR,
+                                  "llapi_poollist: Lustre filesystem '%s'"
+                                  " not found", name);
+                        goto out;
+                }
+        }
+
+        if (poolname != NULL) {
+                rc = print_pool_members(fsname, pathname, poolname);
+        } else {
+                DIR *dir;
+                struct dirent *pool;
+
+                llapi_printf(LLAPI_MSG_NORMAL, "Pools from %s:\n", fsname);
+                dir = opendir(pathname);
+                if (dir == NULL)
+                        return -EINVAL;
+                while ((pool = readdir(dir)) != NULL) {
+                        /* skip the "." and ".." entries */
+                        if (strcmp(pool->d_name, ".") == 0 ||
+                            strcmp(pool->d_name, "..") == 0)
+                                continue;
+                        llapi_printf(LLAPI_MSG_NORMAL, " %s.%s\n", fsname,
+                                     pool->d_name);
+                }
+                closedir(dir);
+        }
+out:
+        /* undo the in-place split so the caller's string is left intact,
+         * on failure as well as on success */
+        if (poolname != NULL)
+                poolname[-1] = '.';
+        return rc;
+}
+
 typedef int (semantic_func_t)(char *path, DIR *parent, DIR *d,
                               void *data, cfs_dirent_t *de);
 
@@ -315,9 +588,9 @@ typedef int (semantic_func_t)(char *path, DIR *parent, DIR *d,
 
 static int common_param_init(struct find_param *param)
 {
-        param->lumlen = lov_mds_md_size(MAX_LOV_UUID_COUNT);
+        param->lumlen = lov_mds_md_size(MAX_LOV_UUID_COUNT, LOV_MAGIC_V3);
         if ((param->lmd = malloc(sizeof(lstat_t) + param->lumlen)) == NULL) {
-                llapi_err(LLAPI_MSG_ERROR, 
+                llapi_err(LLAPI_MSG_ERROR,
                           "error: allocation of %d bytes for ioctl",
                           sizeof(lstat_t) + param->lumlen);
                 return -ENOMEM;
@@ -338,7 +611,7 @@ static void find_param_fini(struct find_param *param)
                 free(param->lmd);
 }
 
-int llapi_file_get_lov_fuuid(int fd, struct obd_uuid *lov_name)
+int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_name)
 {
         int rc = ioctl(fd, OBD_IOC_GETNAME, lov_name);
         if (rc) {
@@ -355,11 +628,11 @@ int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid)
         fd = open(path, O_RDONLY);
         if (fd < 0) {
                 rc = errno;
-                llapi_err(LLAPI_MSG_ERROR, "error opening %s\n", path);
+                llapi_err(LLAPI_MSG_ERROR, "error opening %s", path);
                 return rc;
         }
 
-        rc = llapi_file_get_lov_fuuid(fd, lov_uuid);
+        rc = llapi_file_fget_lov_uuid(fd, lov_uuid);
 
         close(fd);
 
@@ -380,7 +653,7 @@ int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count)
         int rc = 0, index = 0;
 
         /* Get the lov name */
-        rc = llapi_file_get_lov_fuuid(fd, &lov_name);
+        rc = llapi_file_fget_lov_uuid(fd, &lov_name);
         if (rc)
                 return rc;
 
@@ -422,11 +695,11 @@ static int setup_obd_uuid(DIR *dir, char *dname, struct find_param *param)
         int rc = 0, index;
 
         /* Get the lov name */
-        rc = llapi_file_get_lov_fuuid(dirfd(dir), &lov_uuid);
+        rc = llapi_file_fget_lov_uuid(dirfd(dir), &lov_uuid);
         if (rc) {
                 if (errno != ENOTTY) {
                         rc = errno;
-                        llapi_err(LLAPI_MSG_ERROR, 
+                        llapi_err(LLAPI_MSG_ERROR,
                                   "error: can't get lov name: %s", dname);
                 } else {
                         rc = 0;
@@ -470,7 +743,7 @@ static int setup_obd_uuid(DIR *dir, char *dname, struct find_param *param)
 
         if (!param->quiet && param->obduuid &&
             (param->obdindex == OBD_NOT_FOUND)) {
-                llapi_err_noerrno(LLAPI_MSG_ERROR, 
+                llapi_err_noerrno(LLAPI_MSG_ERROR,
                                   "error: %s: unknown obduuid: %s",
                                   __FUNCTION__, param->obduuid->uuid);
                 //rc = EINVAL;
@@ -539,14 +812,16 @@ retry_get_uuids:
         return 0;
 }
 
-void lov_dump_user_lmm_v1(struct lov_user_md_v1 *lum, char *path, int is_dir,
-                          int obdindex, int quiet, int header, int body)
+void lov_dump_user_lmm_v1v3(struct lov_user_md *lum, char *pool_name,
+                            struct lov_user_ost_data_v1 *objects,
+                            char *path, int is_dir,
+                            int obdindex, int quiet, int header, int body)
 {
         int i, obdstripe = 0;
 
         if (obdindex != OBD_NOT_FOUND) {
                 for (i = 0; !is_dir && i < lum->lmm_stripe_count; i++) {
-                        if (obdindex == lum->lmm_objects[i].l_ost_idx) {
+                        if (obdindex == objects[i].l_ost_idx) {
                                 llapi_printf(LLAPI_MSG_NORMAL, "%s\n", path);
                                 obdstripe = 1;
                                 break;
@@ -564,44 +839,49 @@ void lov_dump_user_lmm_v1(struct lov_user_md_v1 *lum, char *path, int is_dir,
                                 llapi_printf(LLAPI_MSG_NORMAL, "(Default) ");
                                 lum->lmm_object_gr = LOV_OBJECT_GROUP_CLEAR;
                         }
-                        llapi_printf(LLAPI_MSG_NORMAL, 
+                        llapi_printf(LLAPI_MSG_NORMAL,
                                      "stripe_count: %d stripe_size: %u "
-                                     "stripe_offset: %d\n",
+                                     "stripe_offset: %d%s%s\n",
                                      lum->lmm_stripe_count == (__u16)-1 ? -1 :
-                                     lum->lmm_stripe_count,
+                                        lum->lmm_stripe_count,
                                      lum->lmm_stripe_size,
                                      lum->lmm_stripe_offset == (__u16)-1 ? -1 :
-                                     lum->lmm_stripe_offset);
+                                        lum->lmm_stripe_offset,
+                                     pool_name != NULL ? " pool: " : "",
+                                     pool_name != NULL ? pool_name : "");
                 }
                 return;
         }
 
         if (header && (obdstripe == 1)) {
-                llapi_printf(LLAPI_MSG_NORMAL, 
+                llapi_printf(LLAPI_MSG_NORMAL,
                              "lmm_magic:          0x%08X\n",  lum->lmm_magic);
-                llapi_printf(LLAPI_MSG_NORMAL, 
+                llapi_printf(LLAPI_MSG_NORMAL,
                              "lmm_object_gr:      "LPX64"\n", lum->lmm_object_gr);
-                llapi_printf(LLAPI_MSG_NORMAL, 
+                llapi_printf(LLAPI_MSG_NORMAL,
                              "lmm_object_id:      "LPX64"\n", lum->lmm_object_id);
-                llapi_printf(LLAPI_MSG_NORMAL, 
+                llapi_printf(LLAPI_MSG_NORMAL,
                              "lmm_stripe_count:   %u\n", (int)lum->lmm_stripe_count);
-                llapi_printf(LLAPI_MSG_NORMAL, 
+                llapi_printf(LLAPI_MSG_NORMAL,
                              "lmm_stripe_size:    %u\n",      lum->lmm_stripe_size);
-                llapi_printf(LLAPI_MSG_NORMAL, 
+                llapi_printf(LLAPI_MSG_NORMAL,
                              "lmm_stripe_pattern: %x\n",      lum->lmm_pattern);
+                if (pool_name != NULL)
+                        llapi_printf(LLAPI_MSG_NORMAL,
+                                     "lmm_pool_name:      %s\n",      pool_name);
         }
 
         if (body) {
                 if ((!quiet) && (obdstripe == 1))
-                        llapi_printf(LLAPI_MSG_NORMAL, 
+                        llapi_printf(LLAPI_MSG_NORMAL,
                                      "\tobdidx\t\t objid\t\tobjid\t\t group\n");
 
                 for (i = 0; i < lum->lmm_stripe_count; i++) {
-                        int idx = lum->lmm_objects[i].l_ost_idx;
-                        long long oid = lum->lmm_objects[i].l_object_id;
-                        long long gr = lum->lmm_objects[i].l_object_gr;
+                        int idx = objects[i].l_ost_idx;
+                        long long oid = objects[i].l_object_id;
+                        long long gr = objects[i].l_object_gr;
                         if ((obdindex == OBD_NOT_FOUND) || (obdindex == idx))
-                                llapi_printf(LLAPI_MSG_NORMAL, 
+                                llapi_printf(LLAPI_MSG_NORMAL,
                                              "\t%6u\t%14llu\t%#13llx\t%14llu%s\n",
                                              idx, oid, oid, gr,
                                              obdindex == idx ? " *" : "");
@@ -631,13 +911,13 @@ void lov_dump_user_lmm_join(struct lov_user_md_v1 *lum, char *path,
         }
 
         if (header && obdstripe == 1) {
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_magic:          0x%08X\n",  
+                llapi_printf(LLAPI_MSG_NORMAL, "lmm_magic:          0x%08X\n",
                              lumj->lmm_magic);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_gr:      "LPX64"\n", 
+                llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_gr:      "LPX64"\n",
                              lumj->lmm_object_gr);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_id:      "LPX64"\n", 
+                llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_id:      "LPX64"\n",
                              lumj->lmm_object_id);
-                llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_count:   %u\n", 
+                llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_count:   %u\n",
                              (int)lumj->lmm_stripe_count);
                 llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_size:    %u\n",
                              lumj->lmm_stripe_size);
@@ -650,7 +930,7 @@ void lov_dump_user_lmm_join(struct lov_user_md_v1 *lum, char *path,
         if (body) {
                 unsigned long long start = -1, end = 0;
                 if (!quiet && obdstripe == 1)
-                        llapi_printf(LLAPI_MSG_NORMAL, 
+                        llapi_printf(LLAPI_MSG_NORMAL,
                                      "joined\tobdidx\t\t objid\t\tobjid\t\t group"
                                      "\t\tstart\t\tend\n");
                 for (i = 0; i < lumj->lmm_stripe_count; i++) {
@@ -658,7 +938,7 @@ void lov_dump_user_lmm_join(struct lov_user_md_v1 *lum, char *path,
                         long long oid = lumj->lmm_objects[i].l_object_id;
                         long long gr = lumj->lmm_objects[i].l_object_gr;
                         if (obdindex == OBD_NOT_FOUND || obdindex == idx)
-                                llapi_printf(LLAPI_MSG_NORMAL, 
+                                llapi_printf(LLAPI_MSG_NORMAL,
                                              "\t%6u\t%14llu\t%#13llx\t%14llu%s",
                                              idx, oid, oid, gr,
                                              obdindex == idx ? " *" : "");
@@ -668,10 +948,10 @@ void lov_dump_user_lmm_join(struct lov_user_md_v1 *lum, char *path,
                                 llapi_printf(LLAPI_MSG_NORMAL, "\t%14llu", start);
                                 end = lumj->lmm_objects[i].l_extent_end;
                                 if (end == (unsigned long long)-1)
-                                        llapi_printf(LLAPI_MSG_NORMAL, 
+                                        llapi_printf(LLAPI_MSG_NORMAL,
                                                      "\t\tEOF\n");
                                 else
-                                        llapi_printf(LLAPI_MSG_NORMAL, 
+                                        llapi_printf(LLAPI_MSG_NORMAL,
                                                      "\t\t%llu\n", end);
                         } else {
                                 llapi_printf(LLAPI_MSG_NORMAL, "\t\t\t\t\n");
@@ -686,10 +966,12 @@ void llapi_lov_dump_user_lmm(struct find_param *param,
 {
         switch(*(__u32 *)&param->lmd->lmd_lmm) { /* lum->lmm_magic */
         case LOV_USER_MAGIC_V1:
-                lov_dump_user_lmm_v1(&param->lmd->lmd_lmm, path, is_dir,
-                                      param->obdindex, param->quiet,
-                                      param->verbose,
-                                      (param->verbose || !param->obduuid));
+                lov_dump_user_lmm_v1v3(&param->lmd->lmd_lmm, NULL,
+                                       param->lmd->lmd_lmm.lmm_objects,
+                                       path, is_dir,
+                                       param->obdindex, param->quiet,
+                                       param->verbose,
+                                       (param->verbose || !param->obduuid));
                 break;
         case LOV_USER_MAGIC_JOIN:
                 lov_dump_user_lmm_join(&param->lmd->lmd_lmm, path, is_dir,
@@ -697,10 +979,28 @@ void llapi_lov_dump_user_lmm(struct find_param *param,
                                        param->verbose,
                                        (param->verbose || !param->obduuid));
                 break;
+        case LOV_USER_MAGIC_V3: {
+                char pool_name[MAXPOOLNAME + 1];
+                struct lov_user_ost_data_v1 *objects;
+
+                strncpy(pool_name,
+                        ((struct lov_user_md_v3 *)(&param->lmd->lmd_lmm))->lmm_pool_name,
+                        MAXPOOLNAME);
+                pool_name[MAXPOOLNAME] = '\0';
+                objects = ((struct lov_user_md_v3 *)(&param->lmd->lmd_lmm))->lmm_objects;
+                lov_dump_user_lmm_v1v3(&param->lmd->lmd_lmm, pool_name,
+                                      objects, path, is_dir,
+                                      param->obdindex, param->quiet,
+                                      param->verbose,
+                                      (param->verbose || !param->obduuid));
+                break;
+        }
         default:
-                llapi_printf(LLAPI_MSG_NORMAL, 
-                             "unknown lmm_magic:  %#x (expecting %#x)\n",
-                       *(__u32 *)&param->lmd->lmd_lmm, LOV_USER_MAGIC_V1);
+                llapi_printf(LLAPI_MSG_NORMAL, "unknown lmm_magic:  %#x "
+                             "(expecting one of %#x %#x %#x)\n",
+                             *(__u32 *)&param->lmd->lmd_lmm,
+                             LOV_USER_MAGIC_V1, LOV_USER_MAGIC_JOIN,
+                             LOV_USER_MAGIC_V3);
                 return;
         }
 }
@@ -785,7 +1085,8 @@ int llapi_mds_getfileinfo(char *path, DIR *parent,
 
         fname = (fname == NULL ? path : fname + 1);
         /* retrieve needed file info */
-        strncpy((char *)lmd, fname, lov_mds_md_size(MAX_LOV_UUID_COUNT));
+        strncpy((char *)lmd, fname,
+                lov_mds_md_size(MAX_LOV_UUID_COUNT, LOV_MAGIC));
         ret = ioctl(dirfd(parent), IOC_MDC_GETFILEINFO, (void *)lmd);
 
         if (ret) {
@@ -794,18 +1095,18 @@ int llapi_mds_getfileinfo(char *path, DIR *parent,
                          * Do the regular lstat(2) instead. */
                         ret = lstat_f(path, st);
                         if (ret) {
-                                llapi_err(LLAPI_MSG_ERROR, 
+                                llapi_err(LLAPI_MSG_ERROR,
                                           "error: %s: lstat failed for %s",
                                           __FUNCTION__, path);
                                 return ret;
                         }
                 } else if (errno == ENOENT) {
-                        llapi_err(LLAPI_MSG_WARN, 
-                                  "warning: %s: %s does not exist", 
+                        llapi_err(LLAPI_MSG_WARN,
+                                  "warning: %s: %s does not exist",
                                   __FUNCTION__, path);
                         return -ENOENT;
                 } else {
-                        llapi_err(LLAPI_MSG_ERROR, 
+                        llapi_err(LLAPI_MSG_ERROR,
                                   "error: %s: IOC_MDC_GETFILEINFO failed for %s",
                                   __FUNCTION__, path);
                         return ret;
@@ -894,7 +1195,7 @@ static int llapi_semantic_traverse(char *path, int size, DIR *parent,
 
                 switch (dent->d_type) {
                 case DT_UNKNOWN:
-                        llapi_err(LLAPI_MSG_ERROR, 
+                        llapi_err(LLAPI_MSG_ERROR,
                                   "error: %s: '%s' is UNKNOWN type %d",
                                   __FUNCTION__, dent->d_name, dent->d_type);
                         break;
@@ -1089,13 +1390,13 @@ static int cb_find_init(char *path, DIR *parent, DIR *dir,
                         lustre_fs = 0;
                         ret = lstat_f(path, st);
                         if (ret) {
-                                llapi_err(LLAPI_MSG_ERROR, 
+                                llapi_err(LLAPI_MSG_ERROR,
                                           "error: %s: lstat failed for %s",
                                           __FUNCTION__, path);
                                 return ret;
                         }
                 } else if (errno == ENOENT) {
-                        llapi_err(LLAPI_MSG_WARN, 
+                        llapi_err(LLAPI_MSG_WARN,
                                   "warning: %s: %s does not exist",
                                   __FUNCTION__, path);
                         goto decided;
@@ -1155,11 +1456,21 @@ static int cb_find_init(char *path, DIR *parent, DIR *dir,
                         goto decided;
                 } else {
                         int i, j;
+                        struct lov_user_ost_data_v1 *lmm_objects;
+
+                        if (param->lmd->lmd_lmm.lmm_magic ==
+                            LOV_USER_MAGIC_V3) {
+                                lmm_objects =
+                                 ((struct lov_user_md_v3 *)(&(param->lmd->lmd_lmm)))->lmm_objects;
+                        } else {
+                                lmm_objects = param->lmd->lmd_lmm.lmm_objects;
+                        }
+
                         for (i = 0;
                              i < param->lmd->lmd_lmm.lmm_stripe_count; i++) {
                                 for (j = 0; j < param->num_obds; j++) {
                                         if (param->obdindexes[j] ==
-                                            param->lmd->lmd_lmm.lmm_objects[i].l_ost_idx)
+                                            lmm_objects[i].l_ost_idx)
                                                 goto obd_matches;
                                 }
                         }
@@ -1189,6 +1500,23 @@ static int cb_find_init(char *path, DIR *parent, DIR *dir,
                 }
         }
 
+        if (param->check_pool) {
+                /* empty requested pool is taken as no pool search => V1 */
+                if (((param->lmd->lmd_lmm.lmm_magic == LOV_USER_MAGIC_V1) &&
+                     (param->poolname[0] == '\0')) ||
+                    ((param->lmd->lmd_lmm.lmm_magic == LOV_USER_MAGIC_V3) &&
+                     (strncmp(((struct lov_user_md_v3 *)(&(param->lmd->lmd_lmm)))->lmm_pool_name,
+                              param->poolname, MAXPOOLNAME) == 0)) ||
+                    ((param->lmd->lmd_lmm.lmm_magic == LOV_USER_MAGIC_V3) &&
+                     (strcmp(param->poolname, "*") == 0))) {
+                        if (param->exclude_pool)
+                                goto decided;
+                } else {
+                        if (!param->exclude_pool)
+                                goto decided;
+                }
+        }
+
         /* Check the time on mds. */
         if (!decision) {
                 int for_mds;
@@ -1208,7 +1536,7 @@ obd_matches:
                 if (param->obdindex != OBD_NOT_FOUND) {
                         /* Check whether the obd is active or not, if it is
                          * not active, just print the object affected by this
-                         * failed ost 
+                         * failed ost
                          * */
                         struct obd_statfs stat_buf;
                         struct obd_uuid uuid_buf;
@@ -1216,15 +1544,15 @@ obd_matches:
                         memset(&stat_buf, 0, sizeof(struct obd_statfs));
                         memset(&uuid_buf, 0, sizeof(struct obd_uuid));
                         ret = llapi_obd_statfs(path, LL_STATFS_LOV,
-                                               param->obdindex, &stat_buf, 
+                                               param->obdindex, &stat_buf,
                                                &uuid_buf);
                         if (ret) {
-                                if (ret == -ENODATA || ret == -ENODEV 
+                                if (ret == -ENODATA || ret == -ENODEV
                                     || ret == -EIO)
                                         errno = EIO;
-                                llapi_printf(LLAPI_MSG_NORMAL, 
+                                llapi_printf(LLAPI_MSG_NORMAL,
                                              "obd_uuid: %s failed %s ",
-                                             param->obduuid->uuid, 
+                                             param->obduuid->uuid,
                                              strerror(errno));
                                 goto print_path;
                         }
@@ -1239,12 +1567,12 @@ obd_matches:
 
                 if (ret) {
                         if (errno == ENOENT) {
-                                llapi_err(LLAPI_MSG_ERROR, 
+                                llapi_err(LLAPI_MSG_ERROR,
                                           "warning: %s: %s does not exist",
                                           __FUNCTION__, path);
                                 goto decided;
                         } else {
-                                llapi_err(LLAPI_MSG_ERROR, 
+                                llapi_err(LLAPI_MSG_ERROR,
                                           "%s: IOC_LOV_GETINFO on %s failed",
                                           __FUNCTION__, path);
                                 return ret;
@@ -1350,20 +1678,20 @@ static int cb_getstripe(char *path, DIR *parent, DIR *d, void *data,
         if (ret) {
                 if (errno == ENODATA) {
                         if (!param->obduuid && !param->quiet)
-                                llapi_printf(LLAPI_MSG_NORMAL, 
+                                llapi_printf(LLAPI_MSG_NORMAL,
                                              "%s has no stripe info\n", path);
                         goto out;
                 } else if (errno == ENOTTY) {
-                        llapi_err(LLAPI_MSG_ERROR, 
+                        llapi_err(LLAPI_MSG_ERROR,
                                   "%s: '%s' not on a Lustre fs?",
                                   __FUNCTION__, path);
                 } else if (errno == ENOENT) {
-                        llapi_err(LLAPI_MSG_WARN, 
+                        llapi_err(LLAPI_MSG_WARN,
                                   "warning: %s: %s does not exist",
                                   __FUNCTION__, path);
                         goto out;
                 } else {
-                        llapi_err(LLAPI_MSG_ERROR, 
+                        llapi_err(LLAPI_MSG_ERROR,
                                   "error: %s: %s failed for %s",
                                    __FUNCTION__, d ? "LL_IOC_LOV_GETSTRIPE" :
                                   "IOC_MDC_GETFILESTRIPE", path);
@@ -1388,7 +1716,7 @@ int llapi_getstripe(char *path, struct find_param *param)
         int ret = 0, len = strlen(path);
 
         if (len > PATH_MAX) {
-                llapi_err(LLAPI_MSG_ERROR, 
+                llapi_err(LLAPI_MSG_ERROR,
                           "%s: Path name '%s' is too long",
                           __FUNCTION__, path);
                 return -EINVAL;
@@ -1434,7 +1762,7 @@ int llapi_obd_statfs(char *path, __u32 type, __u32 index,
         data.ioc_plen2 = sizeof(struct obd_uuid);
 
         if ((rc = obd_ioctl_pack(&data, &rawbuf, sizeof(raw))) != 0) {
-                llapi_err(LLAPI_MSG_ERROR, 
+                llapi_err(LLAPI_MSG_ERROR,
                           "llapi_obd_statfs: error packing ioctl data");
                 return rc;
         }
@@ -1445,7 +1773,7 @@ int llapi_obd_statfs(char *path, __u32 type, __u32 index,
 
         if (fd < 0) {
                 rc = errno ? -errno : -EBADF;
-                llapi_err(LLAPI_MSG_ERROR, "error: %s: opening '%s'", 
+                llapi_err(LLAPI_MSG_ERROR, "error: %s: opening '%s'",
                           __FUNCTION__, path);
                 return rc;
         }
@@ -1694,11 +2022,11 @@ static int cb_quotachown(char *path, DIR *parent, DIR *d, void *data,
         if (rc) {
                 if (errno == ENODATA) {
                         if (!param->obduuid && !param->quiet)
-                                llapi_err(LLAPI_MSG_ERROR, 
+                                llapi_err(LLAPI_MSG_ERROR,
                                           "%s has no stripe info", path);
                         rc = 0;
                 } else if (errno == ENOENT) {
-                        llapi_err(LLAPI_MSG_ERROR, 
+                        llapi_err(LLAPI_MSG_ERROR,
                                   "warning: %s: %s does not exist",
                                   __FUNCTION__, path);
                         rc = 0;
index f173b56..d1fd956 100644 (file)
@@ -398,6 +398,26 @@ void print_lustre_cfg(struct lustre_cfg *lcfg, int *skip)
                        createtime, canceltime);
                 break;
         }
+        case(LCFG_POOL_NEW):{
+                printf("pool new ");
+                print_1_cfg(lcfg);
+                break;
+        }
+        case(LCFG_POOL_ADD):{
+                printf("pool add ");
+                print_1_cfg(lcfg);
+                break;
+        }
+        case(LCFG_POOL_REM):{
+                printf("pool remove ");
+                print_1_cfg(lcfg);
+                break;
+        }
+        case(LCFG_POOL_DEL):{
+                printf("pool destroy ");
+                print_1_cfg(lcfg);
+                break;
+        }
         default:
                 printf("unsupported cmd_code = %x\n",cmd);
         }
index 00ccd91..5a5bde3 100644 (file)
@@ -52,6 +52,7 @@
 #include <stdarg.h>
 #include <signal.h>
 #include <ctype.h>
+#include <glob.h>
 
 #include "obdctl.h"
 
@@ -69,6 +70,7 @@
 #include <lnet/lnetctl.h>
 #include <libcfs/libcfsutil.h>
 #include <stdio.h>
+#include <lustre/liblustreapi.h>
 
 #define MAX_STRING_SIZE 128
 #define DEVICES_LIST "/proc/fs/lustre/devices"
@@ -164,42 +166,51 @@ int lcfg_ioctl(char * func, int dev_id, struct lustre_cfg *lcfg)
 
 static int do_device(char *func, char *devname);
 
-int lcfg_mgs_ioctl(char *func, int dev_id, struct lustre_cfg *lcfg)
+/*
+ * Return the device number used to talk to the MGS.
+ * The lookup is done once: the result is kept in a function-local static
+ * and returned directly by later calls.
+ * On failure prints a hint on stderr, sets errno to ENODEV and returns -1.
+ * NOTE(review): cur_device is presumably updated by do_device() - confirm.
+ */
+static int get_mgs_device()
 {
-        struct obd_ioctl_data data;
-        static int mgs_device = -1;
         char mgs[] = "$MGS";
-        int rc;
+        static int mgs_device = -1;
 
-        /* Always operates on MGS dev */
         if (mgs_device == -1) {
+                int rc;
                 do_disconnect(NULL, 1);
                 rc = do_device("mgsioc", mgs);
                 if (rc) {
+                        fprintf(stderr,
+                                "This command must be run on the MGS.\n");
                         errno = ENODEV;
                         return -1;
                 }
                 mgs_device = cur_device;
         }
+        return mgs_device;
+}
+
+/* Returns -1 on error with errno set */
+int lcfg_mgs_ioctl(char *func, int dev_id, struct lustre_cfg *lcfg)
+{
+        struct obd_ioctl_data data;
+        int rc;
 
         IOC_INIT(data);
-        data.ioc_dev = mgs_device;
+        /* on failure get_mgs_device() has already set errno to ENODEV */
+        rc = data.ioc_dev = get_mgs_device();
+        if (rc < 0)
+                goto out;
         data.ioc_type = LUSTRE_CFG_TYPE;
         data.ioc_plen1 = lustre_cfg_len(lcfg->lcfg_bufcount,
                                         lcfg->lcfg_buflens);
         data.ioc_pbuf1 = (void *)lcfg;
         IOC_PACK(func, data);
 
-        rc =  l_ioctl(dev_id, OBD_IOC_PARAM, buf);
-
-        if (rc == ENODEV)
-                fprintf(stderr, "Is the MGS running on this node?\n");
-        if (rc == ENOSYS)
-                fprintf(stderr, "Make sure cfg_device is set first.\n");
-        if (rc == EINVAL)
-                fprintf(stderr, "cfg_device should be of the form "
-                        "'lustre-MDT0000'\n");
-
+        rc = l_ioctl(dev_id, OBD_IOC_PARAM, buf);
+out:
+        /* rc is the ioctl result, or -1 when the MGS device was not found;
+         * the hints below are selected from errno, not from rc */
+        if (rc) {
+                if (errno == ENOSYS)
+                        fprintf(stderr, "Make sure cfg_device is set first.\n");
+                if (errno == EINVAL)
+                        fprintf(stderr, "cfg_device should be of the form "
+                                "'lustre-MDT0000'\n");
+        }
         return rc;
 }
 
@@ -2334,3 +2345,591 @@ void obd_finalize(int argc, char **argv)
         shmem_stop();
         do_disconnect(argv[0], 1);
 }
+
+/*
+ * Locate the target_obd proc file of the lov belonging to fsname.
+ *
+ * fsname: filesystem name used to build the glob pattern
+ * path:   output buffer receiving the first matching path; at most
+ *         PATH_MAX + 1 bytes are written
+ *         NOTE(review): assumes the caller's buffer is that large, as the
+ *         caller in this file provides - confirm for other callers
+ *
+ * Returns 0 on success, -EINVAL if glob() fails or matches nothing.
+ */
+static int find_target_obdpath(char *fsname, char *path)
+{
+        glob_t glob_info;
+        char pattern[PATH_MAX + 1];
+        int rc;
+
+        snprintf(pattern, PATH_MAX,
+                 "/proc/fs/lustre/lov/%s-*/target_obd",
+                 fsname);
+        rc = glob(pattern, GLOB_BRACE, NULL, &glob_info);
+        if (rc)
+                return -EINVAL;
+
+        if (glob_info.gl_pathc == 0) {
+                globfree(&glob_info);
+                return -EINVAL;
+        }
+
+        /* bounded copy of the first match */
+        snprintf(path, PATH_MAX + 1, "%s", glob_info.gl_pathv[0]);
+        /* the match list was leaked on this path before; release it */
+        globfree(&glob_info);
+        return 0;
+}
+
+/*
+ * Locate the proc file describing pool poolname of filesystem fsname.
+ *
+ * fsname:   filesystem name used to build the glob pattern
+ * poolname: pool name used to build the glob pattern
+ * poolpath: output buffer receiving the first matching path; at most
+ *           PATH_MAX + 1 bytes are written
+ *           NOTE(review): assumes the caller's buffer is that large, as
+ *           the caller in this file provides - confirm for other callers
+ *
+ * Returns 0 on success, -EINVAL if glob() fails or matches nothing.
+ */
+static int find_poolpath(char *fsname, char *poolname, char *poolpath)
+{
+        glob_t glob_info;
+        char pattern[PATH_MAX + 1];
+        int rc;
+
+        snprintf(pattern, PATH_MAX,
+                 "/proc/fs/lustre/lov/%s-*/pools/%s",
+                 fsname, poolname);
+        rc = glob(pattern, GLOB_BRACE, NULL, &glob_info);
+        if (rc)
+                return -EINVAL;
+
+        if (glob_info.gl_pathc == 0) {
+                globfree(&glob_info);
+                return -EINVAL;
+        }
+
+        /* bounded copy of the first match */
+        snprintf(poolpath, PATH_MAX + 1, "%s", glob_info.gl_pathv[0]);
+        /* the match list was leaked on this path before; release it */
+        globfree(&glob_info);
+        return 0;
+}
+
+/*
+ * Search for an OST in the lov target list or in a pool.
+ *
+ * if pool is NULL, search ostname in target_obd:
+ *  returns 1 if found and 0 if not
+ * if pool is not NULL
+ *  if pool not found returns < 0
+ *  if ostname is NULL, returns 1 if pool is not empty and 0 if pool empty
+ *  if ostname is not NULL, returns 1 if OST is in pool and 0 if not
+ * returns -EINVAL if both poolname and ostname are NULL, or if the
+ * file to read cannot be located or opened
+ */
+static int search_ost(char *fsname, char *poolname, char *ostname)
+{
+        FILE *fd;
+        char buffer[PATH_MAX + 1];
+        int len = 0, rc;
+
+        /* nothing to search for: do not hand a NULL name to strncmp() */
+        if ((poolname == NULL) && (ostname == NULL))
+                return -EINVAL;
+
+        if (ostname != NULL)
+                len = strlen(ostname);
+
+        if (poolname == NULL)
+                rc = find_target_obdpath(fsname, buffer);
+        else
+                rc = find_poolpath(fsname, poolname, buffer);
+        if (rc)
+                return rc;
+
+        if ((fd = fopen(buffer, "r")) == NULL)
+                return -EINVAL;
+
+        while (fgets(buffer, sizeof(buffer), fd) != NULL) {
+                if (poolname == NULL) {
+                        char *ptr;
+
+                        /* we search ostname in target_obd.
+                         * NOTE(review): lines are assumed to look like
+                         * "IDX: name_UUID STATUS"; the name is taken after
+                         * the first space so that an index of any width
+                         * works, not only single-digit ones - confirm */
+                        ptr = strchr(buffer, ' ');
+                        if ((ptr != NULL) &&
+                            (strncmp(ptr + 1, ostname, len) == 0)) {
+                                fclose(fd);
+                                return 1;
+                        }
+                } else {
+                        /* we search a non empty pool or
+                           an ostname in a pool */
+                        if ((ostname == NULL) ||
+                            (strncmp(buffer, ostname, len) == 0)) {
+                                fclose(fd);
+                                return 1;
+                        }
+                }
+        }
+        fclose(fd);
+        return 0;
+}
+
+/*
+ * Sanity-check a pool command with search_ost() before it is sent.
+ *
+ * cmd:      LCFG_POOL_NEW, LCFG_POOL_DEL, LCFG_POOL_ADD or LCFG_POOL_REM;
+ *           any other command is accepted without checks
+ * fsname:   filesystem name
+ * poolname: pool name
+ * ostname:  OST name, only used by LCFG_POOL_ADD and LCFG_POOL_REM
+ *
+ * Returns 0 if the command can proceed, else -EEXIST, -ENOENT or
+ * -ENOTEMPTY after printing the reason on stderr.
+ */
+static int check_pool_cmd(enum lcfg_command_type cmd,
+                          char *fsname, char *poolname,
+                          char *ostname)
+{
+        int rc = 0;
+
+        switch (cmd) {
+        case LCFG_POOL_NEW: {
+                /* any non-negative result means the pool file exists */
+                if (search_ost(fsname, poolname, NULL) >= 0) {
+                        fprintf(stderr, "Pool %s.%s already exists\n",
+                                fsname, poolname);
+                        return -EEXIST;
+                }
+                return 0;
+        }
+        case LCFG_POOL_DEL: {
+                rc = search_ost(fsname, poolname, NULL);
+                if (rc < 0) {
+                        fprintf(stderr, "Pool %s.%s not found\n",
+                                fsname, poolname);
+                        return -ENOENT;
+                }
+                if (rc == 1) {
+                        fprintf(stderr, "Pool %s.%s not empty, "
+                                "please remove all members\n",
+                                fsname, poolname);
+                        return -ENOTEMPTY;
+                }
+                return 0;
+        }
+        case LCFG_POOL_ADD: {
+                /* a negative result (target list not readable) must not
+                 * let an unverified OST through, so treat it as absent */
+                rc = search_ost(fsname, NULL, ostname);
+                if (rc <= 0) {
+                        fprintf(stderr, "OST %s not found in lov of %s\n",
+                                ostname, fsname);
+                        return -ENOENT;
+                }
+                rc = search_ost(fsname, poolname, ostname);
+                if (rc < 0) {
+                        fprintf(stderr, "Pool %s.%s not found\n",
+                                fsname, poolname);
+                        return -ENOENT;
+                }
+                if (rc == 1) {
+                        fprintf(stderr, "OST %s already in pool %s.%s\n",
+                                ostname, fsname, poolname);
+                        return -EEXIST;
+                }
+                return 0;
+        }
+        case LCFG_POOL_REM: {
+                rc = search_ost(fsname, poolname, ostname);
+                if (rc < 0) {
+                        fprintf(stderr, "Pool %s.%s not found\n",
+                                fsname, poolname);
+                        return -ENOENT;
+                }
+                if (rc == 0) {
+                        fprintf(stderr, "OST %s not found in pool %s.%s\n",
+                                ostname, fsname, poolname);
+                        return -ENOENT;
+                }
+                return 0;
+        }
+        default: {
+        }
+        }
+        return 0;
+}
+
+/*
+ * Poll search_ost() until the effect of a pool command becomes visible,
+ * then report the outcome on stderr.
+ * At most 10 lookups are made, with a 2 second pause after every lookup
+ * that does not yet show the expected state.
+ * Commands other than the four pool commands are ignored.
+ */
+static void check_pool_cmd_result(enum lcfg_command_type cmd,
+                                  char *fsname, char *poolname,
+                                  char *ostname)
+{
+        int tries, rc = 0;
+
+        switch (cmd) {
+        case LCFG_POOL_NEW:
+                /* wait for the pool to show up */
+                for (tries = 10; tries > 0; tries--) {
+                        rc = search_ost(fsname, poolname, NULL);
+                        if (rc >= 0)
+                                break;
+                        sleep(2);
+                }
+                if (rc < 0)
+                        fprintf(stderr, "Warning, pool %s.%s not found\n",
+                                fsname, poolname);
+                else
+                        fprintf(stderr, "Pool %s.%s created\n",
+                                fsname, poolname);
+                break;
+        case LCFG_POOL_DEL:
+                /* wait for the pool to disappear */
+                for (tries = 10; tries > 0; tries--) {
+                        rc = search_ost(fsname, poolname, NULL);
+                        if (rc < 0)
+                                break;
+                        sleep(2);
+                }
+                if (rc >= 0)
+                        fprintf(stderr, "Warning, pool %s.%s still found\n",
+                                fsname, poolname);
+                else
+                        fprintf(stderr, "Pool %s.%s destroyed\n",
+                                fsname, poolname);
+                break;
+        case LCFG_POOL_ADD:
+                /* wait for the OST to be listed in the pool */
+                for (tries = 10; tries > 0; tries--) {
+                        rc = search_ost(fsname, poolname, ostname);
+                        if (rc == 1)
+                                break;
+                        sleep(2);
+                }
+                if (rc != 1)
+                        fprintf(stderr, "Warning, OST %s not found in pool %s.%s\n",
+                                ostname, fsname, poolname);
+                else
+                        fprintf(stderr, "OST %s added to pool %s.%s\n",
+                                ostname, fsname, poolname);
+                break;
+        case LCFG_POOL_REM:
+                /* wait for the OST to leave the pool */
+                for (tries = 10; tries > 0; tries--) {
+                        rc = search_ost(fsname, poolname, ostname);
+                        if (rc != 1)
+                                break;
+                        sleep(2);
+                }
+                if (rc == 1)
+                        fprintf(stderr, "Warning, OST %s still found in pool %s.%s\n",
+                                ostname, fsname, poolname);
+                else
+                        fprintf(stderr, "OST %s removed from pool %s.%s\n",
+                                ostname, fsname, poolname);
+                break;
+        default:
+                break;
+        }
+}
+
+/*
+ * Validate an OST name and expand it, in place, to the form
+ * fsname-OSTXXXX_UUID where XXXX is four hexadecimal digits.
+ *
+ * Accepted inputs are OSTXXXX, OSTXXXX_UUID, fsname-OSTXXXX and
+ * fsname-OSTXXXX_UUID.
+ *
+ * fsname:  filesystem name the OST must belong to
+ * ostname: in/out buffer; on success it is overwritten with the completed
+ *          name, which can be longer than the input
+ *          NOTE(review): the caller must provide room for the expansion,
+ *          presumably MAX_OBD_NAME + 1 bytes - confirm against callers
+ *
+ * Returns 0 on success, -EINVAL (with a message on stderr) if the name is
+ * malformed or does not fit in MAX_OBD_NAME characters.
+ */
+static int check_and_complete_ostname(char *fsname, char *ostname)
+{
+        char *ptr;
+        char real_ostname[MAX_OBD_NAME + 1];
+        size_t fslen = strlen(fsname);
+        int i, n;
+
+        /* if OST name does not start with fsname, we add it */
+        /* if not check if the fsname is the right one */
+        ptr = strchr(ostname, '-');
+        if (ptr == NULL) {
+                n = snprintf(real_ostname, sizeof(real_ostname), "%s-%s",
+                             fsname, ostname);
+        } else if ((strncmp(ostname, fsname, fslen) != 0) ||
+                   (ostname[fslen] != '-')) {
+                /* the prefix must be exactly "fsname-", otherwise the
+                 * parsing below would start at the wrong offset */
+                fprintf(stderr, "%s does not start with fsname %s\n",
+                        ostname, fsname);
+                return -EINVAL;
+        } else {
+                n = snprintf(real_ostname, sizeof(real_ostname), "%s",
+                             ostname);
+        }
+        /* refuse names that were truncated instead of overflowing */
+        if ((n < 0) || ((size_t)n >= sizeof(real_ostname))) {
+                fprintf(stderr, "ostname %s is too long\n", ostname);
+                return -EINVAL;
+        }
+        /* real_ostname is fsname-????? */
+        ptr = real_ostname + fslen + 1;
+        if (strncmp(ptr, "OST", 3) != 0) {
+                fprintf(stderr, "%s does not start by %s-OST nor OST\n",
+                        ostname, fsname);
+                return -EINVAL;
+        }
+        /* real_ostname is fsname-OST????? */
+        ptr += 3;
+        for (i = 0; i < 4; i++) {
+                /* ctype functions need an unsigned char value */
+                if (!isxdigit((unsigned char)*ptr)) {
+                        fprintf(stderr,
+                                "ost's index in %s is not an hexa number\n",
+                                ostname);
+                        return -EINVAL;
+                }
+                ptr++;
+        }
+        /* real_ostname is fsname-OSTXXXX????? */
+        /* if OST name does not end with _UUID, we add it */
+        if (*ptr == '\0') {
+                if (strlen(real_ostname) + strlen("_UUID") >=
+                    sizeof(real_ostname)) {
+                        fprintf(stderr, "ostname %s is too long\n", ostname);
+                        return -EINVAL;
+                }
+                strcat(real_ostname, "_UUID");
+        } else if (strcmp(ptr, "_UUID") != 0) {
+                fprintf(stderr,
+                        "ostname %s does not end with _UUID\n", ostname);
+                return -EINVAL;
+        }
+        /* real_ostname is fsname-OSTXXXX_UUID */
+        strcpy(ostname, real_ostname);
+        return 0;
+}
+
+/*
+ * Check a pool command, pack it into a lustre_cfg and send it with the
+ * OBD_IOC_POOL ioctl.
+ *
+ * cmd:          pool command, also passed to check_pool_cmd()
+ * cmdname:      stored in config buffer 0 and passed to IOC_PACK
+ * fullpoolname: stored in config buffer 1
+ * fsname, poolname: only used for the check_pool_cmd() pre-check
+ * ostname:      stored in config buffer 2 when not NULL
+ *
+ * returns 0 or -errno
+ */
+static int pool_cmd(enum lcfg_command_type cmd,
+                    char *cmdname, char *fullpoolname,
+                    char *fsname, char *poolname, char *ostname)
+{
+        int rc = 0;
+        struct obd_ioctl_data data;
+        struct lustre_cfg_bufs bufs;
+        struct lustre_cfg *lcfg;
+
+        /* refuse the command early if the pre-check fails */
+        rc = check_pool_cmd(cmd, fsname, poolname, ostname);
+        if (rc)
+                return rc;
+
+        lustre_cfg_bufs_reset(&bufs, NULL);
+        lustre_cfg_bufs_set_string(&bufs, 0, cmdname);
+        lustre_cfg_bufs_set_string(&bufs, 1, fullpoolname);
+        if (ostname != NULL)
+                lustre_cfg_bufs_set_string(&bufs, 2, ostname);
+
+        lcfg = lustre_cfg_new(cmd, &bufs);
+        if (IS_ERR(lcfg)) {
+                rc = PTR_ERR(lcfg);
+                return rc;
+        }
+
+        IOC_INIT(data);
+        /* get_mgs_device() returns -1 with errno set on failure */
+        rc = data.ioc_dev = get_mgs_device();
+        if (rc < 0)
+                goto out;
+
+        data.ioc_type = LUSTRE_CFG_TYPE;
+        data.ioc_plen1 = lustre_cfg_len(lcfg->lcfg_bufcount,
+                                        lcfg->lcfg_buflens);
+        data.ioc_pbuf1 = (void *)lcfg;
+        /* NOTE(review): IOC_PACK is a macro defined elsewhere; if it can
+         * return on failure, lcfg is not freed on that path - confirm */
+        IOC_PACK(cmdname, data);
+
+        rc = l_ioctl(OBD_DEV_ID, OBD_IOC_POOL, buf);
+out:
+        /* any failure is reported to the caller as -errno */
+        if (rc)
+                rc = -errno;
+        lustre_cfg_free(lcfg);
+        return rc;
+}
+
+/*
+ * this function transforms a rule [start-end/step] into an array
+ * of matching numbers
+ * supported forms are:
+ * [start]                : just this number
+ * [start-end]            : all numbers from start to end
+ * [start-end/step]       : numbers from start to end with increment of step
+ * several forms can be combined inside one pair of brackets, separated
+ * by ',' (for instance [0-4/2,7])
+ * on return, format contains a printf format string which can be used
+ * to generate all the strings
+ *
+ * rule   is modified in place: '[' and ']' are overwritten with '\0'
+ * format receives rule with the bracketed part replaced by "%.4d"; it is
+ *        written without bound, so the caller must provide a buffer large
+ *        enough for the whole rule
+ * array  receives a malloc'ed array the caller has to free
+ *
+ * a rule without both '[' and ']' is copied to format unchanged and gives
+ * one entry whose value is 0
+ *
+ * returns the number of entries stored in *array, or 0 on error (bad rule
+ * or allocation failure), in which case *array is NULL
+ *
+ * NOTE(review): indices are parsed and printed as decimal here while
+ * check_and_complete_ostname() talks about an hexa index - confirm which
+ * base is intended.
+ */
+static int get_array_idx(char *rule, char *format, int **array)
+{
+        /* largest entry count that still fits the int return value */
+        const unsigned int max_sz = ~0U >> 1;
+        char *start, *end, *ptr;
+        unsigned int lo, hi, step, count, k;
+        int *tmp;
+        int array_sz = 0;
+        int rc;
+
+        *array = NULL;
+
+        start = strchr(rule, '[');
+        end = strchr(rule, ']');
+        if ((start == NULL) || (end == NULL)) {
+                *array = malloc(sizeof(int));
+                if (*array == NULL)
+                        return 0;
+                /* format has no conversion, but the caller still passes
+                 * this slot to snprintf(): give it a defined value */
+                (*array)[0] = 0;
+                strcpy(format, rule);
+                return 1;
+        }
+        /* a ']' placed before the '[' is not a range */
+        if (end < start)
+                return 0;
+
+        *start = '\0';
+        *end = '\0';
+        end++;
+        start++;
+        /* put in format the printf format (the rule without the range) */
+        sprintf(format, "%s%%.4d%s", rule, end);
+
+        /* loop on , separator */
+        do {
+                /* extract the 3 fields */
+                rc = sscanf(start, "%u-%u/%u", &lo, &hi, &step);
+                if (rc == 0)
+                        goto fail;
+                /* rc < 0 (nothing left to parse): the element is skipped */
+                if (rc > 0) {
+                        /* missing fields: a single number is the range
+                         * lo-lo, a missing step is 1 */
+                        if (rc == 1)
+                                hi = lo;
+                        if (rc < 3)
+                                step = 1;
+                        if ((hi < lo) || (step == 0))
+                                goto fail;
+
+                        count = (hi - lo) / step + 1;
+                        /* count == 0 means the + 1 above wrapped */
+                        if ((count == 0) ||
+                            (count > max_sz - (unsigned int)array_sz) ||
+                            (array_sz + count > (size_t)-1 / sizeof(int)))
+                                goto fail;
+
+                        /* keep the old array reachable if realloc fails */
+                        tmp = realloc(*array,
+                                      sizeof(int) * (array_sz + count));
+                        if (tmp == NULL)
+                                goto fail;
+                        *array = tmp;
+
+                        /* counting entries (instead of comparing with hi)
+                         * cannot wrap around and loop forever */
+                        for (k = 0; k < count; k++)
+                                tmp[array_sz + k] = lo + k * step;
+                        array_sz += count;
+                }
+                ptr = strchr(start, ',');
+                if (ptr != NULL)
+                        start = ptr + 1;
+
+        } while (ptr != NULL);
+        return array_sz;
+
+fail:
+        /* do not leak the entries collected before the error */
+        free(*array);
+        *array = NULL;
+        return 0;
+}
+
+/*
+ * Split an argument of the form <fsname>.<poolname> at its first '.'.
+ *
+ * arg      the argument to split (left untouched)
+ * fsname   receives a full copy of arg, cut at the '.' on success; the
+ *          copy is unbounded, so the buffer must be able to hold arg
+ * poolname receives the part after the '.'; the buffer must hold
+ *          MAXPOOLNAME + 1 bytes
+ *
+ * returns 0 on success, -EINVAL when the '.' or one of the two names is
+ * missing, -ENAMETOOLONG when the pool name exceeds MAXPOOLNAME; every
+ * error is reported on stderr
+ */
+static int extract_fsname_poolname(char *arg, char *fsname, char *poolname)
+{
+        char *dot;
+        char *pool;
+        int pool_len;
+        int rc = -EINVAL;
+
+        strcpy(fsname, arg);
+        dot = strchr(fsname, '.');
+        if (dot == NULL) {
+                fprintf(stderr, ". is missing in %s\n", fsname);
+                goto bad_arg;
+        }
+
+        /* the '.' is the very first character: nothing before it */
+        if (dot == fsname) {
+                fprintf(stderr, "fsname is empty\n");
+                goto bad_arg;
+        }
+
+        pool = dot + 1;
+        pool_len = strlen(pool);
+        if (pool_len == 0) {
+                fprintf(stderr, "poolname is empty\n");
+                goto bad_arg;
+        }
+        if (pool_len > MAXPOOLNAME) {
+                fprintf(stderr,
+                        "poolname %s is too long (length is %d max is %d)\n",
+                        pool, pool_len, MAXPOOLNAME);
+                rc = -ENAMETOOLONG;
+                goto bad_arg;
+        }
+
+        strncpy(poolname, pool, MAXPOOLNAME);
+        poolname[MAXPOOLNAME] = '\0';
+        /* only now cut fsname, once the pool name has been copied out */
+        *dot = '\0';
+        return 0;
+
+bad_arg:
+        fprintf(stderr, "argument %s must be <fsname>.<poolname>\n", arg);
+        return rc;
+}
+
+/*
+ * lctl handler shared by the pool_new, pool_destroy, pool_list, pool_add
+ * and pool_remove commands; argv[0] selects the operation.
+ *
+ * With one argument (argc == 2) it serves pool_new and pool_destroy on
+ * <fsname>.<poolname>, and hands pool_list directly to llapi_poollist().
+ * With more arguments it serves pool_add and pool_remove: argv[1] is
+ * <fsname>.<poolname> and each following argument is an OST name rule
+ * expanded by get_array_idx(); one pool_cmd() is issued per resulting
+ * OST name, and the result of every successful one is then verified with
+ * check_pool_cmd_result().
+ *
+ * returns CMD_HELP on a usage error, 0 on success, otherwise a negative
+ * errno which is also reported on stderr. For pool_add/pool_remove all
+ * the OSTs are processed even if some of them fail, and the error of the
+ * first failing one is returned.
+ */
+int jt_pool_cmd(int argc, char **argv)
+{
+        enum lcfg_command_type cmd;
+        char fsname[PATH_MAX + 1];
+        char poolname[MAXPOOLNAME + 1];
+        char *ostnames_buf = NULL;
+        int i, rc;
+        int *array = NULL, array_sz;
+        struct {
+                int     rc;
+                char   *ostname;
+        } *cmds = NULL;
+
+        switch (argc) {
+        case 0:
+        case 1: return CMD_HELP;
+        case 2: {
+                if (strcmp("pool_new", argv[0]) == 0)
+                        cmd = LCFG_POOL_NEW;
+                else if (strcmp("pool_destroy", argv[0]) == 0)
+                        cmd = LCFG_POOL_DEL;
+                else if (strcmp("pool_list", argv[0]) == 0)
+                         return llapi_poollist(argv[1]);
+                else return CMD_HELP;
+
+                rc = extract_fsname_poolname(argv[1], fsname, poolname);
+                if (rc)
+                        break;
+
+                rc = pool_cmd(cmd, argv[0], argv[1],
+                              fsname, poolname, NULL);
+                if (rc)
+                        break;
+
+                check_pool_cmd_result(cmd, fsname, poolname, NULL);
+                break;
+        }
+        default: {
+                char format[2*MAX_OBD_NAME];
+                /* error of the first OST command that failed, if any */
+                int first_err = 0;
+
+                if (strcmp("pool_remove", argv[0]) == 0) {
+                        cmd = LCFG_POOL_REM;
+                } else if (strcmp("pool_add", argv[0]) == 0) {
+                        cmd = LCFG_POOL_ADD;
+                } else {
+                        return CMD_HELP;
+                }
+
+                rc = extract_fsname_poolname(argv[1], fsname, poolname);
+                if (rc)
+                        break;
+
+                for (i = 2; i < argc; i++) {
+                        int j;
+
+                        array_sz = get_array_idx(argv[i], format, &array);
+                        if (array_sz == 0) {
+                                /* a rejected rule may still own entries */
+                                free(array);
+                                return CMD_HELP;
+                        }
+
+                        cmds = malloc(array_sz * sizeof(cmds[0]));
+                        if (cmds == NULL) {
+                                free(array);
+                                rc = -ENOMEM;
+                                goto out;
+                        }
+                        /* only a cache of the completed names: when this
+                         * allocation fails the names are rebuilt below */
+                        ostnames_buf = malloc(array_sz *
+                                              (MAX_OBD_NAME + 1));
+
+                        for (j = 0; j < array_sz; j++) {
+                                char ostname[MAX_OBD_NAME + 1];
+
+                                snprintf(ostname, MAX_OBD_NAME, format,
+                                         array[j]);
+                                ostname[MAX_OBD_NAME] = '\0';
+
+                                rc = check_and_complete_ostname(fsname,ostname);
+                                if (rc) {
+                                        free(array);
+                                        free(cmds);
+                                        free(ostnames_buf);
+                                        goto out;
+                                }
+                                if (ostnames_buf != NULL) {
+                                        cmds[j].ostname =
+                                          &ostnames_buf[(MAX_OBD_NAME + 1) * j];
+                                        strcpy(cmds[j].ostname, ostname);
+                                } else {
+                                        cmds[j].ostname = NULL;
+                                }
+                                cmds[j].rc = pool_cmd(cmd, argv[0], argv[1],
+                                                      fsname, poolname,
+                                                      ostname);
+                                /* keep going, but remember the failure */
+                                if (cmds[j].rc && !first_err)
+                                        first_err = cmds[j].rc;
+                        }
+                        for (j = 0; j < array_sz; j++) {
+                                if (!cmds[j].rc) {
+                                        char ostname[MAX_OBD_NAME + 1];
+
+                                        if (!cmds[j].ostname) {
+                                                snprintf(ostname, MAX_OBD_NAME,
+                                                         format, array[j]);
+                                                ostname[MAX_OBD_NAME] = '\0';
+                                                check_and_complete_ostname(
+                                                        fsname, ostname);
+                                        } else {
+                                                strcpy(ostname,
+                                                       cmds[j].ostname);
+                                        }
+                                        check_pool_cmd_result(cmd, fsname,
+                                                              poolname,ostname);
+                                }
+                        }
+                        /* free(NULL) is a no-op; reset the pointers so the
+                         * next iteration never sees freed memory */
+                        free(array);
+                        array = NULL;
+                        free(cmds);
+                        cmds = NULL;
+                        free(ostnames_buf);
+                        ostnames_buf = NULL;
+                }
+                rc = first_err;
+                break;
+        }
+        }
+
+
+out:
+        if ((rc == -EINVAL) || (rc == -ENOENT))
+                fprintf(stderr, "Does the fs, pool or ost exist?\n");
+        if (rc != 0) {
+                errno = -rc;
+                perror(argv[0]);
+        }
+
+        return rc;
+}
index a5fd90a..eeb1bb8 100644 (file)
@@ -119,4 +119,6 @@ int jt_blockdev_attach(int argc, char **argv);
 int jt_blockdev_detach(int argc, char **argv);
 int jt_blockdev_info(int argc, char **argv);
 
+int jt_pool_cmd(int argc, char **argv);
+
 #endif
index ef6d35b..dc366bd 100644 (file)
@@ -60,7 +60,7 @@
 #define lustre_swab_ldlm_request NULL
 #define lustre_swab_ldlm_reply NULL
 #define lustre_swab_ldlm_intent NULL
-#define lustre_swab_lov_mds_md NULL
+/* #define lustre_swab_lov_mds_md NULL */
 #define lustre_swab_mdt_rec_reint NULL
 #define lustre_swab_lustre_capa NULL
 #define lustre_swab_lustre_capa_key NULL
index 05cdec0..e75cf6c 100644 (file)
@@ -344,6 +344,33 @@ check_lov_mds_md_join(void)
 }
 
+/*
+ * Run the wire checks for struct lov_mds_md_v3 (including its
+ * lmm_pool_name member) and for struct lov_ost_data_v1, then for the
+ * LOV_MAGIC_V3 define and the LOV_PATTERN_RAID0/RAID1 values.
+ * The CHECK_* and BLANK_LINE macros are defined elsewhere in this file;
+ * presumably the order of the calls is the order of the generated
+ * output, so keep it stable.
+ */
 static void
+check_lov_mds_md_v3(void)
+{
+        BLANK_LINE();
+        CHECK_STRUCT(lov_mds_md_v3);
+        CHECK_MEMBER(lov_mds_md_v3, lmm_magic);
+        CHECK_MEMBER(lov_mds_md_v3, lmm_pattern);
+        CHECK_MEMBER(lov_mds_md_v3, lmm_object_id);
+        CHECK_MEMBER(lov_mds_md_v3, lmm_object_gr);
+        CHECK_MEMBER(lov_mds_md_v3, lmm_stripe_size);
+        CHECK_MEMBER(lov_mds_md_v3, lmm_stripe_count);
+        CHECK_MEMBER(lov_mds_md_v3, lmm_pool_name);
+        CHECK_MEMBER(lov_mds_md_v3, lmm_objects);
+
+        /* per-stripe object entry */
+        BLANK_LINE();
+        CHECK_STRUCT(lov_ost_data_v1);
+        CHECK_MEMBER(lov_ost_data_v1, l_object_id);
+        CHECK_MEMBER(lov_ost_data_v1, l_object_gr);
+        CHECK_MEMBER(lov_ost_data_v1, l_ost_gen);
+        CHECK_MEMBER(lov_ost_data_v1, l_ost_idx);
+
+        CHECK_CDEFINE(LOV_MAGIC_V3);
+
+        CHECK_VALUE(LOV_PATTERN_RAID0);
+        CHECK_VALUE(LOV_PATTERN_RAID1);
+}
+
+static void
 check_obd_statfs(void)
 {
         BLANK_LINE();
@@ -1307,6 +1334,7 @@ main(int argc, char **argv)
         check_obd_connect_data();
         check_obdo();
         check_lov_mds_md_v1();
+        check_lov_mds_md_v3();
         check_lov_mds_md_join();
         check_obd_statfs();
         check_obd_ioobj();