From 665e36b780faa2144cecccd29a0d8a8196a76903 Mon Sep 17 00:00:00 2001 From: nathan Date: Mon, 15 Sep 2008 23:25:34 +0000 Subject: [PATCH] b=14836 i=nathan i=adilger OST pools on HEAD, comprehensive patch including 17054:19007; 16935:18918,19012,19089,19128; 16978:18872 --- lustre/ChangeLog | 4 + lustre/doc/lfs.1 | 48 ++- lustre/include/lprocfs_status.h | 18 +- lustre/include/lustre/liblustreapi.h | 16 +- lustre/include/lustre/lustre_idl.h | 28 +- lustre/include/lustre/lustre_user.h | 23 +- lustre/include/lustre_cfg.h | 6 +- lustre/include/lustre_lib.h | 1 + lustre/include/obd.h | 71 +++- lustre/include/obd_class.h | 48 +++ lustre/include/obd_lov.h | 12 +- lustre/ldlm/ldlm_lib.c | 2 +- lustre/liblustre/super.c | 26 +- lustre/llite/dir.c | 92 ++++- lustre/llite/file.c | 56 +++- lustre/llite/llite_lib.c | 8 +- lustre/lov/Makefile.in | 2 +- lustre/lov/autoMakefile.am | 3 +- lustre/lov/lov_ea.c | 112 ++++++- lustre/lov/lov_internal.h | 22 ++ lustre/lov/lov_obd.c | 55 ++- lustre/lov/lov_pack.c | 282 ++++++++++++---- lustre/lov/lov_pool.c | 619 ++++++++++++++++++++++++++++++++++ lustre/lov/lov_qos.c | 275 +++++++++------ lustre/mdd/mdd_lov.c | 21 +- lustre/mdd/mdd_trans.c | 5 +- lustre/mds/handler.c | 2 +- lustre/mds/mds_internal.h | 3 +- lustre/mds/mds_lov.c | 2 +- lustre/mgs/mgs_handler.c | 175 ++++++++-- lustre/mgs/mgs_internal.h | 3 + lustre/mgs/mgs_llog.c | 139 ++++++++ lustre/obdclass/debug.c | 5 +- lustre/obdclass/lprocfs_status.c | 46 ++- lustre/obdclass/obd_config.c | 22 ++ lustre/obdfilter/filter.c | 2 +- lustre/osc/osc_request.c | 30 +- lustre/ptlrpc/pack_generic.c | 77 ++--- lustre/ptlrpc/ptlrpc_module.c | 4 +- lustre/tests/ll_dirstripe_verify.c | 2 +- lustre/tests/ll_getstripe_info.c | 2 +- lustre/tests/llmount.sh | 1 + lustre/tests/sanity.sh | 126 ++++++- lustre/tests/test-framework.sh | 4 +- lustre/utils/Makefile.am | 4 +- lustre/utils/lctl.c | 18 + lustre/utils/lfs.c | 105 ++++-- lustre/utils/liblustreapi.c | 524 +++++++++++++++++++++++------ 
lustre/utils/llog_reader.c | 20 ++ lustre/utils/obd.c | 631 ++++++++++++++++++++++++++++++++++- lustre/utils/obdctl.h | 2 + lustre/utils/req-layout.c | 2 +- lustre/utils/wirecheck.c | 28 ++ 53 files changed, 3303 insertions(+), 531 deletions(-) create mode 100644 lustre/lov/lov_pool.c diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 4141ff9..4ecc54e 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -95,6 +95,10 @@ Details : When connection is reused this not moved from CONN_UNUSED_HASH again in unused hash. Severity : enhancement +Bugzilla : 15899 +Description: File striping can now be set to use an arbitrary pool of OSTs. + +Severity : enhancement Bugzilla : 16573 Description: Export bytes_read/bytes_write count on OSC/OST. diff --git a/lustre/doc/lfs.1 b/lustre/doc/lfs.1 index 0fca528..cea475f 100644 --- a/lustre/doc/lfs.1 +++ b/lustre/doc/lfs.1 @@ -17,18 +17,29 @@ lfs \- Lustre utility to create a file with specific striping pattern, find the \fB[[!] --uid|-u N] [[!] --user|-U ] \fB\fR .br +.B lfs osts +.br .B lfs getstripe [--obd|-O ] [--quiet|-q] [--verbose|-v] - \fB[--recursive|-r] \fR + \fB[--recursive|-r] \fR .br .B lfs setstripe [--size|-s stripe-size] [--count|-c stripe-cnt] - \fB[--index|-i start-ost] \fR + \fB[--offset|-o start-ost] [--pool|-p pool-name] + \fB\fR .br -.B lfs setstripe -d +.B lfs setstripe -d .br -.B lfs quotachown [-i] +.B lfs poollist [.] | +.br +.B lfs quota [-v] [-o obd_uuid] [-u|-g] +.br +.B lfs quota +.br +.B lfs quota -t [-u|-g] .br .B lfs quotacheck [-ug] .br +.B lfs quotachown [-i] +.br .B lfs quotaon [-ugf] .br .B lfs quotaoff [-ug] @@ -59,13 +70,15 @@ Report filesystem disk space usage or inodes usage of each MDT/OST. 
.B find To search the directory tree rooted at the given dir/file name for the files that match the given parameters: \fB--atime\fR (file was last accessed N*24 hours ago), \fB--ctime\fR (file's status was last changed N*24 hours ago), \fB--mtime\fR (file's data was last modified N*24 hours ago), \fB--obd\fR (file has an object on a specific OST or OSTs), \fB--size\fR (file has size in bytes, or \fBk\fRilo-, \fBM\fRega-, \fBG\fRiga-, \fBT\fRera-, \fBP\fReta-, or \fBE\fRxabytes if a suffix is given), \fB--type\fR (file has the type: \fBb\fRlock, \fBc\fRharacter, \fBd\fRirectory, \fBp\fRipe, \fBf\fRile, sym\fBl\fRink, \fBs\fRocket, or \fBD\fRoor (Solaris)), \fB--uid\fR (file has specific numeric user ID), \fB--user\fR (file owned by specific user, numeric user ID allowed), \fB--gid\fR (file has specific group ID), \fB--group\fR (file belongs to specific group, numeric group ID allowed). The option \fB--maxdepth\fR allows find to descend at most N levels of directory tree. The options \fB--print\fR and \fB--print0\fR print full file name, followed by a newline or NUL character correspondingly. Using \fB!\fR before an option negates its meaning (\fIfiles NOT matching the parameter\fR). Using \fB+\fR before a numeric value means \fIfiles with the parameter OR MORE\fR, while \fB-\fR before a numeric value means \fIfiles with the parameter OR LESS\fR. .TP -.B getstripe -To list the striping info for a given filename or files in a directory, optionally recursively, for all files in a directory tree: \fB--quiet\fR (don't print object IDs), \fB--verbose\fR (print striping parameters), \fB--recursive\fR (recurse into subdirectories). 
-.TP .B osts List all the OSTs for the filesystem .TP -.B setstripe [--size stripe-size] [--count stripe-cnt] [--index start-ost] +.B getstripe +To list the striping info for a given filename or files in a directory, optionally recursively, for all files in a directory tree: \fB--quiet\fR (don't print object IDs), \fB--verbose\fR (print striping parameters), \fB--recursive\fR (recurse into subdirectories). +.TP +.B setstripe [--size stripe-size] [--count stripe-cnt] + \fB[--offset start-ost] [--pool pool-name]\fR +.br To create a new file, or set the directory default, with the specified striping parameters. The .I stripe-count is the number of OSTs to stripe a file over. A @@ -78,15 +91,24 @@ is the number of bytes to store on each OST before moving to the next OST. A .I stripe-size of 0 means to use the filesystem-wide default stripe size (default 1MB). The .I start-ost -is the OST index (starting at 0) on which to start striping for this file. A +is the OST index (base 10, starting at 0) on which to start striping for this file. A .I start-ost -of -1 allows the MDS to specify the starting index and it is strongly -recommended that the starting OST not be given, as this allows space and -load balancing to be done by the MDS as needed. +of -1 allows the MDS to choose the starting index and it is strongly recommended, as this allows space and load balancing to be done by the MDS as needed. The +.I pool-name +is the name of a predefined pool of OSTs (see +.I lctl +) that will be used for striping. The +.I stripe-count, stripe-size, start-ost +will be used as well; the +.I start-ost +must be part of the pool or an error will be returned. .TP -.B lfs setstripe -d +.B setstripe -d Delete the default striping on the specified directory. .TP +.B poollist [.] 
| +List the pools in \fBfilesystem\fR or \fBpathname\fR, or the OSTs in \fBfilesystem.pool\fR +.TP .B quotachown To change files' owner and group on OSTs of the specified filesystem .TP diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h index c3e69c7..9696a7b 100644 --- a/lustre/include/lprocfs_status.h +++ b/lustre/include/lprocfs_status.h @@ -403,9 +403,12 @@ extern int lprocfs_add_clear_entry(struct obd_device * obd, extern int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *peer_nid, int *newnid); extern int lprocfs_exp_cleanup(struct obd_export *exp); -extern int lprocfs_add_simple(struct proc_dir_entry *root, - char *name, cfs_read_proc_t *read_proc, - cfs_write_proc_t *write_proc, void *data); +extern cfs_proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root, + char *name, + cfs_read_proc_t *read_proc, + cfs_write_proc_t *write_proc, + void *data, + struct file_operations *fops); extern struct proc_dir_entry *lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, const char *dest); extern void lprocfs_free_per_client_stats(struct obd_device *obd); @@ -436,10 +439,6 @@ extern cfs_proc_dir_entry_t *lprocfs_srch(cfs_proc_dir_entry_t *root, extern int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list); extern int lprocfs_obd_cleanup(struct obd_device *obd); -extern int lprocfs_add_simple(struct proc_dir_entry *root, char *name, - cfs_read_proc_t *read_proc, - cfs_write_proc_t *write_proc, - void *data); extern void lprocfs_free_per_client_stats(struct obd_device *obd); extern struct file_operations lprocfs_evict_client_fops; @@ -658,11 +657,12 @@ static inline int lprocfs_exp_setup(struct obd_export *exp, { return 0; } static inline int lprocfs_exp_cleanup(struct obd_export *exp) { return 0; } -static inline int lprocfs_add_simple(struct proc_dir_entry *root, +static inline cfs_proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root, char *name, cfs_read_proc_t *read_proc, 
cfs_write_proc_t *write_proc, - void *data) + void *data, + struct file_operations *fops) {return 0; } static inline struct proc_dir_entry *lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, const char *dest) diff --git a/lustre/include/lustre/liblustreapi.h b/lustre/include/lustre/liblustreapi.h index 24857e4..e6c1e43 100644 --- a/lustre/include/lustre/liblustreapi.h +++ b/lustre/include/lustre/liblustreapi.h @@ -73,6 +73,14 @@ extern int llapi_file_create(const char *name, unsigned long stripe_size, extern int llapi_file_open(const char *name, int flags, int mode, unsigned long stripe_size, int stripe_offset, int stripe_count, int stripe_pattern); +extern int llapi_file_create_pool(const char *name, unsigned long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern, char *pool_name); +extern int llapi_file_open_pool(const char *name, int flags, int mode, + unsigned long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, + char *pool_name); +extern int llapi_poollist(char *name); extern int llapi_file_get_stripe(const char *path, struct lov_user_md *lum); #define HAVE_LLAPI_FILE_LOOKUP extern int llapi_file_lookup(int dirfd, const char *name); @@ -102,7 +110,9 @@ struct find_param { exclude_gid:1, exclude_uid:1, check_gid:1, - check_uid:1; + check_uid:1, + check_pool:1, + exclude_pool:1; int verbose; int quiet; @@ -124,6 +134,8 @@ struct find_param { /* In-precess parameters. 
*/ unsigned int depth; dev_t st_dev; + + char poolname[MAXPOOLNAME+1]; }; extern int llapi_getstripe(char *path, struct find_param *param); @@ -136,7 +148,7 @@ extern int llapi_ping(char *obd_type, char *obd_name); extern int llapi_target_check(int num_types, char **obd_types, char *dir); extern int llapi_catinfo(char *dir, char *keyword, char *node_name); extern int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid); -extern int llapi_file_get_lov_fuuid(int fd, struct obd_uuid *lov_uuid); +extern int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid); extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count); extern int llapi_is_lustre_mnttype(const char *type); extern int parse_size(char *optarg, unsigned long long *size, diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index 4f09b84..942ec44 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -607,6 +607,7 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); *b=10600 */ #define OBD_CONNECT_CKSUM 0x20000000ULL /* support several cksum algos */ #define OBD_CONNECT_FID 0x40000000ULL /* FID is supported by server */ +#define OBD_CONNECT_LOV_V3 0x100000000ULL /* client supports lov v3 ea */ /* also update obd_connect_names[] for lprocfs_rd_connect_flags() * and lustre/utils/wirecheck.c */ @@ -626,7 +627,8 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | \ OBD_CONNECT_MDS_MDS | OBD_CONNECT_CANCELSET | \ OBD_CONNECT_FID | \ - LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_AT) + LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_AT | \ + OBD_CONNECT_LOV_V3) #define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \ @@ -748,6 +750,7 @@ typedef __u32 obd_count; #define LOV_MAGIC_V1 0x0BD10BD0 #define LOV_MAGIC LOV_MAGIC_V1 #define 
LOV_MAGIC_JOIN 0x0BD20BD0 +#define LOV_MAGIC_V3 0x0BD30BD0 #define LOV_PATTERN_RAID0 0x001 /* stripes are used round-robin */ #define LOV_PATTERN_RAID1 0x002 /* stripes are mirrors of each other */ @@ -757,6 +760,9 @@ typedef __u32 obd_count; #define LOV_OBJECT_GROUP_DEFAULT ~0ULL #define LOV_OBJECT_GROUP_CLEAR 0ULL +#define MAXPOOLNAME 16 +#define POOLNAMEF "%.16s" + #define lov_ost_data lov_ost_data_v1 struct lov_ost_data_v1 { /* per-stripe data structure (little-endian)*/ __u64 l_object_id; /* OST object ID */ @@ -776,7 +782,7 @@ struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */ struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ }; -extern void lustre_swab_lov_mds_md(struct lov_mds_md *llm); +/* extern void lustre_swab_lov_mds_md(struct lov_mds_md *llm); */ #define MAX_MD_SIZE (sizeof(struct lov_mds_md) + 4 * sizeof(struct lov_ost_data)) #define MIN_MD_SIZE (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data)) @@ -785,6 +791,18 @@ extern void lustre_swab_lov_mds_md(struct lov_mds_md *llm); #define XATTR_NAME_ACL_DEFAULT "system.posix_acl_default" #define XATTR_NAME_LOV "trusted.lov" +struct lov_mds_md_v3 { /* LOV EA mds/wire data (little-endian) */ + __u32 lmm_magic; /* magic number = LOV_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + __u64 lmm_object_id; /* LOV object ID */ + __u64 lmm_object_gr; /* LOV object group */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u32 lmm_stripe_count; /* num stripes in use for this object */ + char lmm_pool_name[MAXPOOLNAME]; /* must be 32bit aligned */ + struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +}; + + #define OBD_MD_FLID (0x00000001ULL) /* object ID */ #define OBD_MD_FLATIME (0x00000002ULL) /* access time */ #define OBD_MD_FLMTIME (0x00000004ULL) /* data modification time */ @@ -2151,8 +2169,10 @@ extern void lustre_swab_ost_body (struct ost_body *b); extern void lustre_swab_ost_last_id(obd_id *id); extern void 
lustre_swab_fiemap(struct ll_user_fiemap *fiemap); -extern void lustre_swab_lov_user_md(struct lov_user_md *lum); -extern void lustre_swab_lov_user_md_objects(struct lov_user_md *lum); +extern void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum); +extern void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum); +extern void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count); extern void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj); /* llog_swab.c */ diff --git a/lustre/include/lustre/lustre_user.h b/lustre/include/lustre/lustre_user.h index 2fd8bc5..ed40b0e 100644 --- a/lustre/include/lustre/lustre_user.h +++ b/lustre/include/lustre/lustre_user.h @@ -129,13 +129,15 @@ struct obd_statfs; #define LOV_USER_MAGIC_V1 0x0BD10BD0 #define LOV_USER_MAGIC LOV_USER_MAGIC_V1 - #define LOV_USER_MAGIC_JOIN 0x0BD20BD0 +#define LOV_USER_MAGIC_V3 0x0BD30BD0 #define LOV_PATTERN_RAID0 0x001 #define LOV_PATTERN_RAID1 0x002 #define LOV_PATTERN_FIRST 0x100 +#define MAXPOOLNAME 16 + #define lov_user_ost_data lov_user_ost_data_v1 struct lov_user_ost_data_v1 { /* per-stripe data structure */ __u64 l_object_id; /* OST object ID */ @@ -156,6 +158,18 @@ struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ } __attribute__((packed)); +struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + __u64 lmm_object_id; /* LOV object ID */ + __u64 lmm_object_gr; /* LOV object group */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + __u16 lmm_stripe_offset; /* starting stripe offset in lmm_objects */ + char lmm_pool_name[MAXPOOLNAME]; /* pool name */ + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed)); + /* Compile with 
-D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to * use this. It is unsafe to #define those values in this header as it * is possible the application has already #included . */ @@ -163,7 +177,12 @@ struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ #define lov_user_mds_data lov_user_mds_data_v1 struct lov_user_mds_data_v1 { lstat_t lmd_st; /* MDS stat struct */ - struct lov_user_md_v1 lmd_lmm; /* LOV EA user data */ + struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */ +} __attribute__((packed)); + +struct lov_user_mds_data_v3 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v3 lmd_lmm; /* LOV EA V3 user data */ } __attribute__((packed)); #endif diff --git a/lustre/include/lustre_cfg.h b/lustre/include/lustre_cfg.h index 266d6f5..e52a9f3 100644 --- a/lustre/include/lustre_cfg.h +++ b/lustre/include/lustre_cfg.h @@ -73,6 +73,10 @@ enum lcfg_command_type { LCFG_ADD_MDC = 0x00cf014, LCFG_DEL_MDC = 0x00cf015, LCFG_SPTLRPC_CONF = 0x00ce016, + LCFG_POOL_NEW = 0x00ce020, + LCFG_POOL_ADD = 0x00ce021, + LCFG_POOL_REM = 0x00ce022, + LCFG_POOL_DEL = 0x00ce023, }; struct lustre_cfg_bufs { @@ -222,7 +226,7 @@ static inline struct lustre_cfg *lustre_cfg_new(int cmd, OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen)); if (!lcfg) - RETURN(lcfg); + RETURN(ERR_PTR(-ENOMEM)); lcfg->lcfg_version = LUSTRE_CFG_VERSION; lcfg->lcfg_command = cmd; diff --git a/lustre/include/lustre_lib.h b/lustre/include/lustre_lib.h index 0a8eb9a..83697fe 100644 --- a/lustre/include/lustre_lib.h +++ b/lustre/include/lustre_lib.h @@ -498,6 +498,7 @@ static inline void obd_ioctl_freedata(char *buf, int len) #define OBD_IOC_DUMP_LOG _IOWR('f', 185, OBD_IOC_DATA_TYPE) #define OBD_IOC_CLEAR_LOG _IOWR('f', 186, OBD_IOC_DATA_TYPE) #define OBD_IOC_PARAM _IOW ('f', 187, OBD_IOC_DATA_TYPE) +#define OBD_IOC_POOL _IOWR('f', 188, OBD_IOC_DATA_TYPE) #define OBD_IOC_CATLOGLIST _IOWR('f', 190, OBD_IOC_DATA_TYPE) #define OBD_IOC_LLOG_INFO _IOWR('f', 191, 
OBD_IOC_DATA_TYPE) diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 109c9af..81cd3d8 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -150,6 +150,7 @@ struct lov_stripe_md { __u32 lw_stripe_size; /* size of the stripe */ __u32 lw_pattern; /* striping pattern (RAID0, RAID1) */ unsigned lw_stripe_count; /* number of objects being striped over */ + char lw_pool_name[MAXPOOLNAME]; /* pool name */ } lsm_wire; struct lov_array_info *lsm_array; /*Only for joined file array info*/ @@ -163,6 +164,7 @@ struct lov_stripe_md { #define lsm_stripe_size lsm_wire.lw_stripe_size #define lsm_pattern lsm_wire.lw_pattern #define lsm_stripe_count lsm_wire.lw_stripe_count +#define lsm_pool_name lsm_wire.lw_pool_name struct obd_info; @@ -649,15 +651,32 @@ struct ltd_qos { unsigned int ltq_usable:1; /* usable for striping */ }; +/* Generic subset of OSTs */ +struct ost_pool { + __u32 *op_array; /* array of index of + lov_obd->lov_tgts */ + unsigned int op_count; /* number of OSTs in the array */ + unsigned int op_size; /* allocated size of lp_array */ + rwlock_t op_rwlock; /* to protect lov_pool use */ +}; + +/* Round-robin allocator data */ +struct lov_qos_rr { + __u32 lqr_start_idx; /* start index of new inode */ + __u32 lqr_offset_idx; /* aliasing for start_idx */ + int lqr_start_count; /* reseed counter */ + struct ost_pool lqr_pool; /* round-robin optimized list */ + unsigned long lqr_dirty:1; /* recalc round-robin list */ +}; + +/* Stripe placement optimization */ struct lov_qos { struct list_head lq_oss_list; /* list of OSSs that targets use */ struct rw_semaphore lq_rw_sem; __u32 lq_active_oss_count; - __u32 *lq_rr_array; /* round-robin optimized list */ - unsigned int lq_rr_size; /* rr array size */ unsigned int lq_prio_free; /* priority for free space */ + struct lov_qos_rr lq_rr; /* round robin qos data */ unsigned long lq_dirty:1, /* recalc qos data */ - lq_dirty_rr:1, /* recalc round-robin list */ lq_same_space:1,/* the ost's all have approx. 
the same space avail */ lq_reset:1; /* zero current penalties */ @@ -674,9 +693,29 @@ struct lov_tgt_desc { ltd_reap:1; /* should this target be deleted */ }; +/* Pool metadata */ +#define pool_tgt_size(_p) _p->pool_obds.op_size +#define pool_tgt_count(_p) _p->pool_obds.op_count +#define pool_tgt_array(_p) _p->pool_obds.op_array +#define pool_tgt_rwlock(_p) _p->pool_obds.op_rwlock +#define pool_tgt(_p, _i) _p->pool_lov->lov_tgts[_p->pool_obds.op_array[_i]] + +struct pool_desc { + char pool_name[MAXPOOLNAME + 1]; /* name of pool */ + struct ost_pool pool_obds; /* pool members */ + struct lov_qos_rr pool_rr; /* round robin qos */ + struct hlist_node pool_hash; /* access by poolname */ + struct list_head pool_list; /* serial access */ + cfs_proc_dir_entry_t *pool_proc_entry; /* file in /proc */ + struct lov_obd *pool_lov; /* lov obd to which this + pool belong */ +}; + struct lov_obd { struct lov_desc desc; - struct lov_tgt_desc **lov_tgts; + struct lov_tgt_desc **lov_tgts; /* sparse array */ + struct ost_pool lov_packed; /* all OSTs in a packed + array */ struct semaphore lov_lock; struct obd_connect_data lov_ocd; struct lov_qos lov_qos; /* qos info per lov */ @@ -685,13 +724,14 @@ struct lov_obd { __u32 lov_active_tgt_count; /* how many active */ __u32 lov_death_row;/* tgts scheduled to be deleted */ __u32 lov_tgt_size; /* size of tgts array */ - __u32 lov_start_idx; /* start index of new inode */ - __u32 lov_offset_idx; /* aliasing for start_idx */ - int lov_start_count;/* reseed counter */ int lov_connects; obd_page_removal_cb_t lov_page_removal_cb; obd_pin_extent_cb lov_page_pin_cb; obd_lock_cancel_cb lov_lock_cancel_cb; + int lov_pool_count; + lustre_hash_t *lov_pools_hash_body; /* used for key access */ + struct list_head lov_pool_list; /* used for sequential access */ + cfs_proc_dir_entry_t *lov_pool_proc_entry; }; struct lmv_tgt_desc { @@ -1340,7 +1380,13 @@ struct obd_ops { obd_lock_cancel_cb cb); int (*o_unregister_lock_cancel_cb)(struct obd_export *exp, 
obd_lock_cancel_cb cb); - + /* pools methods */ + int (*o_pool_new)(struct obd_device *obd, char *poolname); + int (*o_pool_del)(struct obd_device *obd, char *poolname); + int (*o_pool_add)(struct obd_device *obd, char *poolname, + char *ostname); + int (*o_pool_rem)(struct obd_device *obd, char *poolname, + char *ostname); /* * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c. @@ -1511,15 +1557,18 @@ struct lsm_operations { struct lov_mds_md *lmm); }; -extern struct lsm_operations lsm_plain_ops; +extern struct lsm_operations lsm_v1_ops; extern struct lsm_operations lsm_join_ops; +extern struct lsm_operations lsm_v3_ops; static inline struct lsm_operations *lsm_op_find(int magic) { switch(magic) { - case LOV_MAGIC: - return &lsm_plain_ops; + case LOV_MAGIC_V1: + return &lsm_v1_ops; case LOV_MAGIC_JOIN: return &lsm_join_ops; + case LOV_MAGIC_V3: + return &lsm_v3_ops; default: CERROR("Cannot recognize lsm_magic %d\n", magic); return NULL; diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h index 8a73f27..fa44819 100644 --- a/lustre/include/obd_class.h +++ b/lustre/include/obd_class.h @@ -946,6 +946,54 @@ static inline int obd_ping(struct obd_export *exp) RETURN(rc); } +static inline int obd_pool_new(struct obd_device *obd, char *poolname) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_new); + + rc = OBP(obd, pool_new)(obd, poolname); + RETURN(rc); +} + +static inline int obd_pool_del(struct obd_device *obd, char *poolname) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_del); + + rc = OBP(obd, pool_del)(obd, poolname); + RETURN(rc); +} + +static inline int obd_pool_add(struct obd_device *obd, char *poolname, char *ostname) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_add); + + rc = OBP(obd, pool_add)(obd, 
poolname, ostname); + RETURN(rc); +} + +static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ostname) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, pool_rem, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_rem); + + rc = OBP(obd, pool_rem)(obd, poolname, ostname); + RETURN(rc); +} + static inline int obd_init_export(struct obd_export *exp) { int rc = 0; diff --git a/lustre/include/obd_lov.h b/lustre/include/obd_lov.h index 64798ab..da3ca51 100644 --- a/lustre/include/obd_lov.h +++ b/lustre/include/obd_lov.h @@ -42,13 +42,17 @@ static inline int lov_stripe_md_size(int stripes) return sizeof(struct lov_stripe_md) + stripes*sizeof(struct lov_oinfo*); } -#define lov_mds_md_size(stripes) lov_mds_md_v1_size(stripes) -static inline int lov_mds_md_v1_size(int stripes) +static inline int lov_mds_md_size(int stripes, int lmm_magic) { - return sizeof(struct lov_mds_md_v1) + - stripes * sizeof(struct lov_ost_data_v1); + if (lmm_magic == LOV_MAGIC_V3) + return sizeof(struct lov_mds_md_v3) + + stripes * sizeof(struct lov_ost_data_v1); + else + return sizeof(struct lov_mds_md_v1) + + stripes * sizeof(struct lov_ost_data_v1); } + #define IOC_LOV_TYPE 'g' #define IOC_LOV_MIN_NR 50 #define IOC_LOV_SET_OSC_ACTIVE _IOWR('g', 50, long) diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 3978b71..27cd186 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -345,7 +345,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) cli->cl_import = imp; /* cli->cl_max_mds_{easize,cookiesize} updated by mdc_init_ea_size() */ - cli->cl_max_mds_easize = sizeof(struct lov_mds_md); + cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3); cli->cl_max_mds_cookiesize = sizeof(struct llog_cookie); if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) { diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index 552a586..d7d37cf 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -1730,11 +1730,25 @@ static int 
llu_lov_dir_setstripe(struct inode *ino, unsigned long arg) if (rc) return(-EFAULT); - if (lum.lmm_magic != LOV_USER_MAGIC) + switch (lum.lmm_magic) { + case LOV_USER_MAGIC_V1: { + if (lum.lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1)) + lustre_swab_lov_user_md_v1(&lum); + break; + } + case LOV_USER_MAGIC_V3: { + if (lum.lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3)) + lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)&lum); + break; + } + default: { + CDEBUG(D_IOCTL, "bad userland LOV MAGIC:" + " %#08x != %#08x nor %#08x\n", + lum.lmm_magic, LOV_USER_MAGIC_V1, + LOV_USER_MAGIC_V3); RETURN(-EINVAL); - - if (lum.lmm_magic != cpu_to_le32(LOV_USER_MAGIC)) - lustre_swab_lov_user_md(&lum); + } + } /* swabbing is done in lov_setstripe() on server side */ rc = md_setattr(sbi->ll_md_exp, &op_data, &lum, @@ -1968,7 +1982,9 @@ struct inode *llu_iget(struct filesys *fs, struct lustre_md *md) static int llu_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp) { - struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC }; + /* even if default lov is LOV_MAGIC_V1 we use LOV_MAGIC_V3 + * to be sure buffer are large enough */ + struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC_V3 }; __u32 valsize = sizeof(struct lov_desc); int rc, easize, def_easize, cookiesize; struct lov_desc desc; diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index 55fa6a8..930bd26 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -555,17 +555,34 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, struct lustre_sb_info *lsi = s2lsi(inode->i_sb); struct obd_device *mgc = lsi->lsi_mgc; char *fsname = NULL, *param = NULL; + int lum_size; /* * This is coming from userspace, so should be in * local endian. But the MDS would like it in little * endian, so we swab it before we send it. 
*/ - if (lump->lmm_magic != LOV_USER_MAGIC) + switch (lump->lmm_magic) { + case LOV_USER_MAGIC_V1: { + if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1)) + lustre_swab_lov_user_md_v1(lump); + lum_size = sizeof(struct lov_user_md_v1); + break; + } + case LOV_USER_MAGIC_V3: { + if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3)) + lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lump); + lum_size = sizeof(struct lov_user_md_v3); + break; + } + default: { + CDEBUG(D_IOCTL, "bad userland LOV MAGIC:" + " %#08x != %#08x nor %#08x\n", + lump->lmm_magic, LOV_USER_MAGIC_V1, + LOV_USER_MAGIC_V3); RETURN(-EINVAL); - - if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC)) - lustre_swab_lov_user_md(lump); + } + } op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, LUSTRE_OPC_ANY, NULL); @@ -573,7 +590,7 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, RETURN(PTR_ERR(op_data)); /* swabbing is done in lov_setstripe() on server side */ - rc = md_setattr(sbi->ll_md_exp, op_data, lump, sizeof(*lump), + rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size, NULL, 0, &req, NULL); ll_finish_md_op_data(op_data); ptlrpc_req_finished(req); @@ -582,6 +599,9 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, CERROR("mdc_setattr fails: rc = %d\n", rc); } + /* In the following we use the fact that LOV_USER_MAGIC_V1 and + LOV_USER_MAGIC_V3 have the same initial fields so we do not + need the make the distiction between the 2 versions */ if (set_default && mgc->u.cli.cl_mgc_mgsexp) { OBD_ALLOC(param, MGS_PARAM_MAXLEN); @@ -661,8 +681,19 @@ int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp, * little endian. We convert it to host endian before * passing it to userspace. 
*/ - if (lmm->lmm_magic == __swab32(LOV_MAGIC)) { - lustre_swab_lov_user_md((struct lov_user_md *)lmm); + /* We don't swab objects for directories */ + switch (le32_to_cpu(lmm->lmm_magic)) { + case LOV_MAGIC_V1: + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); + break; + case LOV_MAGIC_V3: + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) + lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); + break; + default: + CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic); + rc = -EPROTO; } out: *lmmp = lmm; @@ -737,21 +768,33 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, return rc; } case LL_IOC_LOV_SETSTRIPE: { - struct lov_user_md lum, *lump = (struct lov_user_md *)arg; + struct lov_user_md_v3 lumv3; + struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; + struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg; + struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg; + int rc = 0; int set_default = 0; - LASSERT(sizeof(lum) == sizeof(*lump)); - LASSERT(sizeof(lum.lmm_objects[0]) == - sizeof(lump->lmm_objects[0])); - rc = copy_from_user(&lum, lump, sizeof(lum)); + LASSERT(sizeof(lumv3) == sizeof(*lumv3p)); + LASSERT(sizeof(lumv3.lmm_objects[0]) == + sizeof(lumv3p->lmm_objects[0])); + /* first try with v1 which is smaller than v3 */ + rc = copy_from_user(lumv1, lumv1p, sizeof(*lumv1)); if (rc) RETURN(-EFAULT); + if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) { + rc = copy_from_user(&lumv3, lumv3p, sizeof(lumv3)); + if (rc) + RETURN(-EFAULT); + } + if (inode->i_sb->s_root == file->f_dentry) set_default = 1; - rc = ll_dir_setstripe(inode, &lum, set_default); + /* in v1 and v3 cases lumv1 points to data */ + rc = ll_dir_setstripe(inode, lumv1, set_default); RETURN(rc); } @@ -863,6 +906,29 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, if (rc) GOTO(free_lmm, rc = -EFAULT); + switch (lmm->lmm_magic) { + case LOV_USER_MAGIC_V1: + if (LOV_USER_MAGIC_V1 == 
cpu_to_le32(LOV_USER_MAGIC_V1)) + break; + /* swab objects first so that stripes num will be sane */ + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v1 *)lmm)->lmm_objects, + ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count); + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); + break; + case LOV_USER_MAGIC_V3: + if (LOV_USER_MAGIC_V3 == cpu_to_le32(LOV_USER_MAGIC_V3)) + break; + /* swab objects first so that stripes num will be sane */ + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v3 *)lmm)->lmm_objects, + ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count); + lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); + break; + default: + GOTO(free_lmm, rc = -EINVAL); + } + rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize); if (rc < 0) GOTO(free_lmm, rc = -ENOMEM); diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 438593b..96961fc 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -2091,16 +2091,35 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize); LASSERT(lmm != NULL); + if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) && + (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) && + (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) { + GOTO(out, rc = -EPROTO); + } + /* * This is coming from the MDS, so is probably in * little endian. We convert it to host endian before * passing it to userspace. 
*/ - if (lmm->lmm_magic == __swab32(LOV_MAGIC)) { - lustre_swab_lov_user_md((struct lov_user_md *)lmm); - lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm); - } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) { - lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm); + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) { + /* if function called for directory - we should + * avoid swab not existent lsm objects */ + if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) { + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); + if (S_ISREG(body->mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v1 *)lmm)->lmm_objects, + ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count); + } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) { + lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); + if (S_ISREG(body->mode)) + lustre_swab_lov_user_md_objects( + ((struct lov_user_md_v3 *)lmm)->lmm_objects, + ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count); + } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) { + lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm); + } } if (lmm->lmm_magic == LOV_MAGIC_JOIN) { @@ -2193,23 +2212,34 @@ static int ll_lov_setea(struct inode *inode, struct file *file, static int ll_lov_setstripe(struct inode *inode, struct file *file, unsigned long arg) { - struct lov_user_md lum, *lump = (struct lov_user_md *)arg; + struct lov_user_md_v3 lumv3; + struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; + struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg; + struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg; + int lum_size; int rc; int flags = FMODE_WRITE; ENTRY; - /* Bug 1152: copy properly when this is no longer true */ - LASSERT(sizeof(lum) == sizeof(*lump)); - LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0])); - rc = copy_from_user(&lum, lump, sizeof(lum)); + /* first try with v1 which is smaller than v3 */ + lum_size = sizeof(struct lov_user_md_v1); + rc = 
copy_from_user(lumv1, lumv1p, lum_size); if (rc) RETURN(-EFAULT); - rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum)); + if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) { + lum_size = sizeof(struct lov_user_md_v3); + rc = copy_from_user(&lumv3, lumv3p, lum_size); + if (rc) + RETURN(-EFAULT); + } + + rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size); if (rc == 0) { - put_user(0, &lump->lmm_stripe_count); + put_user(0, &lumv1p->lmm_stripe_count); rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), - 0, ll_i2info(inode)->lli_smd, lump); + 0, ll_i2info(inode)->lli_smd, + (void *)arg); } RETURN(rc); } diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index b42fb5a..257516f 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -248,7 +248,7 @@ static struct dentry_operations ll_d_root_ops = { * calculate this (via a call into the LOV + OSCs) each time we make an RPC. */ static int ll_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp) { - struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC }; + struct lov_stripe_md lsm = { .lsm_magic = LOV_MAGIC_V3 }; __u32 valsize = sizeof(struct lov_desc); int rc, easize, def_easize, cookiesize; struct lov_desc desc; @@ -316,7 +316,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) OBD_CONNECT_JOIN | OBD_CONNECT_ATTRFID | OBD_CONNECT_VERSION | OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | OBD_CONNECT_CANCELSET| - OBD_CONNECT_FID | OBD_CONNECT_AT; + OBD_CONNECT_FID | OBD_CONNECT_AT | + OBD_CONNECT_LOV_V3; #ifdef HAVE_LRU_RESIZE_SUPPORT if (sbi->ll_flags & LL_SBI_LRU_RESIZE) @@ -1802,7 +1803,8 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0)); if (lsm != NULL) { if (lli->lli_smd == NULL) { - if (lsm->lsm_magic != LOV_MAGIC && + if (lsm->lsm_magic != LOV_MAGIC_V1 && + lsm->lsm_magic != LOV_MAGIC_V3 && lsm->lsm_magic != LOV_MAGIC_JOIN) { 
dump_lsm(D_ERROR, lsm); LBUG(); diff --git a/lustre/lov/Makefile.in b/lustre/lov/Makefile.in index f714192..0f223f8 100644 --- a/lustre/lov/Makefile.in +++ b/lustre/lov/Makefile.in @@ -1,4 +1,4 @@ MODULES := lov -lov-objs := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o lov_request.o lov_qos.o lov_ea.o +lov-objs := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o lov_request.o lov_qos.o lov_ea.o lov_pool.o @INCLUDE_RULES@ diff --git a/lustre/lov/autoMakefile.am b/lustre/lov/autoMakefile.am index 8c3af02..c65e095 100644 --- a/lustre/lov/autoMakefile.am +++ b/lustre/lov/autoMakefile.am @@ -36,7 +36,7 @@ if LIBLUSTRE noinst_LIBRARIES = liblov.a -liblov_a_SOURCES = lov_log.c lov_obd.c lov_pack.c lov_request.c lov_offset.c lov_qos.c lov_merge.c lov_ea.c lov_internal.h +liblov_a_SOURCES = lov_log.c lov_pool.c lov_obd.c lov_pack.c lov_request.c lov_offset.c lov_qos.c lov_merge.c lov_ea.c lov_internal.h liblov_a_CPPFLAGS = $(LLCPPFLAGS) liblov_a_CFLAGS = $(LLCFLAGS) endif @@ -51,6 +51,7 @@ macos_PROGRAMS := lov lov_SOURCES := \ lov_log.c \ + lov_pool.c \ lov_obd.c \ lov_pack.c \ lov_request.c \ diff --git a/lustre/lov/lov_ea.c b/lustre/lov/lov_ea.c index 8167469..1ea9d70 100755 --- a/lustre/lov/lov_ea.c +++ b/lustre/lov/lov_ea.c @@ -68,19 +68,19 @@ static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes, if (stripe_count == 0 || stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { CERROR("bad stripe count %d\n", stripe_count); - lov_dump_lmm_v1(D_WARNING, lmm); + lov_dump_lmm(D_WARNING, lmm); return -EINVAL; } - + if (lmm->lmm_object_id == 0) { CERROR("zero object id\n"); - lov_dump_lmm_v1(D_WARNING, lmm); + lov_dump_lmm(D_WARNING, lmm); return -EINVAL; } - + if (lmm->lmm_pattern != cpu_to_le32(LOV_PATTERN_RAID0)) { CERROR("bad striping pattern\n"); - lov_dump_lmm_v1(D_WARNING, lmm); + lov_dump_lmm(D_WARNING, lmm); return -EINVAL; } @@ -90,7 +90,7 @@ static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes, 
0xffffffff)) { CERROR("bad stripe size %u\n", le32_to_cpu(lmm->lmm_stripe_size)); - lov_dump_lmm_v1(D_WARNING, lmm); + lov_dump_lmm(D_WARNING, lmm); return -EINVAL; } return 0; @@ -118,6 +118,7 @@ struct lov_stripe_md *lsm_alloc_plain(int stripe_count, int *size) lsm->lsm_oinfo[i] = loi; } lsm->lsm_stripe_count = stripe_count; + lsm->lsm_pool_name[0] = '\0'; return lsm; err: @@ -142,10 +143,15 @@ void lsm_free_plain(struct lov_stripe_md *lsm) static void lsm_unpackmd_common(struct lov_stripe_md *lsm, struct lov_mds_md *lmm) { + /* + * This supposes lov_mds_md_v1/v3 first fields + * are the same + */ lsm->lsm_object_id = le64_to_cpu(lmm->lmm_object_id); lsm->lsm_object_gr = le64_to_cpu(lmm->lmm_object_gr); lsm->lsm_stripe_size = le32_to_cpu(lmm->lmm_stripe_size); lsm->lsm_pattern = le32_to_cpu(lmm->lmm_pattern); + lsm->lsm_pool_name[0] = '\0'; } static void @@ -197,20 +203,20 @@ static int lsm_destroy_plain(struct lov_stripe_md *lsm, struct obdo *oa, return 0; } -static int lsm_lmm_verify_plain(struct lov_mds_md *lmm, int lmm_bytes, +static int lsm_lmm_verify_v1(struct lov_mds_md_v1 *lmm, int lmm_bytes, int *stripe_count) { if (lmm_bytes < sizeof(*lmm)) { - CERROR("lov_mds_md too small: %d, need at least %d\n", + CERROR("lov_mds_md_v1 too small: %d, need at least %d\n", lmm_bytes, (int)sizeof(*lmm)); return -EINVAL; } *stripe_count = le32_to_cpu(lmm->lmm_stripe_count); - if (lmm_bytes < lov_mds_md_v1_size(*stripe_count)) { - CERROR("LOV EA too small: %d, need %d\n", - lmm_bytes, lov_mds_md_v1_size(*stripe_count)); + if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)) { + CERROR("LOV EA V1 too small: %d, need %d\n", + lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)); lov_dump_lmm_v1(D_WARNING, lmm); return -EINVAL; } @@ -218,7 +224,7 @@ static int lsm_lmm_verify_plain(struct lov_mds_md *lmm, int lmm_bytes, return lsm_lmm_verify_common(lmm, lmm_bytes, *stripe_count); } -int lsm_unpackmd_plain(struct lov_obd *lov, struct lov_stripe_md *lsm, +int
lsm_unpackmd_v1(struct lov_obd *lov, struct lov_stripe_md *lsm, struct lov_mds_md_v1 *lmm) { struct lov_oinfo *loi; @@ -249,7 +255,7 @@ int lsm_unpackmd_plain(struct lov_obd *lov, struct lov_stripe_md *lsm, return 0; } -struct lsm_operations lsm_plain_ops = { +struct lsm_operations lsm_v1_ops = { .lsm_free = lsm_free_plain, .lsm_destroy = lsm_destroy_plain, .lsm_stripe_by_index = lsm_stripe_by_index_plain, @@ -258,8 +264,8 @@ struct lsm_operations lsm_plain_ops = { .lsm_stripe_offset_by_index = lsm_stripe_offset_by_index_plain, .lsm_stripe_offset_by_offset = lsm_stripe_offset_by_offset_plain, .lsm_stripe_index_by_offset = lsm_stripe_index_by_offset_plain, - .lsm_lmm_verify = lsm_lmm_verify_plain, - .lsm_unpackmd = lsm_unpackmd_plain, + .lsm_lmm_verify = lsm_lmm_verify_v1, + .lsm_unpackmd = lsm_unpackmd_v1, }; struct lov_extent *lovea_off2le(struct lov_stripe_md *lsm, obd_off lov_off) @@ -625,3 +631,79 @@ struct lsm_operations lsm_join_ops = { .lsm_lmm_verify = lsm_lmm_verify_join, .lsm_unpackmd = lsm_unpackmd_join, }; + + +static int lsm_lmm_verify_v3(struct lov_mds_md *lmmv1, int lmm_bytes, + int *stripe_count) +{ + struct lov_mds_md_v3 *lmm; + + lmm = (struct lov_mds_md_v3 *)lmmv1; + + if (lmm_bytes < sizeof(*lmm)) { + CERROR("lov_mds_md_v3 too small: %d, need at least %d\n", + lmm_bytes, (int)sizeof(*lmm)); + return -EINVAL; + } + + *stripe_count = le32_to_cpu(lmm->lmm_stripe_count); + + if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)) { + CERROR("LOV EA V3 too small: %d, need %d\n", + lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)); + lov_dump_lmm_v3(D_WARNING, lmm); + return -EINVAL; + } + + return lsm_lmm_verify_common((struct lov_mds_md_v1 *)lmm, lmm_bytes, + *stripe_count); +} + +int lsm_unpackmd_v3(struct lov_obd *lov, struct lov_stripe_md *lsm, + struct lov_mds_md *lmmv1) +{ + struct lov_mds_md_v3 *lmm; + struct lov_oinfo *loi; + int i; + + lmm = (struct lov_mds_md_v3 *)lmmv1; + + lsm_unpackmd_common(lsm, (struct lov_mds_md_v1 
*)lmm); + strncpy(lsm->lsm_pool_name, lmm->lmm_pool_name, MAXPOOLNAME); + + for (i = 0; i < lsm->lsm_stripe_count; i++) { + /* XXX LOV STACKING call down to osc_unpackmd() */ + loi = lsm->lsm_oinfo[i]; + loi->loi_id = le64_to_cpu(lmm->lmm_objects[i].l_object_id); + loi->loi_gr = le64_to_cpu(lmm->lmm_objects[i].l_object_gr); + loi->loi_ost_idx = le32_to_cpu(lmm->lmm_objects[i].l_ost_idx); + loi->loi_ost_gen = le32_to_cpu(lmm->lmm_objects[i].l_ost_gen); + if (loi->loi_ost_idx >= lov->desc.ld_tgt_count) { + CERROR("OST index %d more than OST count %d\n", + loi->loi_ost_idx, lov->desc.ld_tgt_count); + lov_dump_lmm_v3(D_WARNING, lmm); + return -EINVAL; + } + if (!lov->lov_tgts[loi->loi_ost_idx]) { + CERROR("OST index %d missing\n", loi->loi_ost_idx); + lov_dump_lmm_v3(D_WARNING, lmm); + return -EINVAL; + } + } + + return 0; +} + +struct lsm_operations lsm_v3_ops = { + .lsm_free = lsm_free_plain, + .lsm_destroy = lsm_destroy_plain, + .lsm_stripe_by_index = lsm_stripe_by_index_plain, + .lsm_stripe_by_offset = lsm_stripe_by_offset_plain, + .lsm_revalidate = lsm_revalidate_plain, + .lsm_stripe_offset_by_index = lsm_stripe_offset_by_index_plain, + .lsm_stripe_offset_by_offset = lsm_stripe_offset_by_offset_plain, + .lsm_stripe_index_by_offset = lsm_stripe_index_by_offset_plain, + .lsm_lmm_verify = lsm_lmm_verify_v3, + .lsm_unpackmd = lsm_unpackmd_v3, +}; + diff --git a/lustre/lov/lov_internal.h b/lustre/lov/lov_internal.h index 77154c7..9a1d66d 100644 --- a/lustre/lov/lov_internal.h +++ b/lustre/lov/lov_internal.h @@ -285,6 +285,9 @@ void lov_free_memmd(struct lov_stripe_md **lsmp); void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm); void lov_dump_lmm_join(int level, struct lov_mds_md_join *lmmj); +void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm); +void lov_dump_lmm(int level, void *lmm); + /* lov_ea.c */ int lov_unpackmd_join(struct lov_obd *lov, struct lov_stripe_md *lsm, struct lov_mds_md *lmm); @@ -306,4 +309,23 @@ static inline void 
lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars) } #endif +/* pools */ +extern lustre_hash_ops_t pool_hash_operations; +/* ost_pool methods */ +int lov_ost_pool_init(struct ost_pool *op, unsigned int count); +int lov_ost_pool_extend(struct ost_pool *op, unsigned int max_count); +int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int max_count); +int lov_ost_pool_remove(struct ost_pool *op, __u32 idx); +int lov_ost_pool_free(struct ost_pool *op); + +/* high level pool methods */ +int lov_pool_new(struct obd_device *obd, char *poolname); +int lov_pool_del(struct obd_device *obd, char *poolname); +int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname); +int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname); +void lov_dump_pool(int level, struct pool_desc *pool); +struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname); +int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool); + + #endif diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 739fee3..2456372 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -667,7 +667,6 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, lov->lov_tgts, lov->lov_tgt_size); } - OBD_ALLOC_PTR(tgt); if (!tgt) { mutex_up(&lov->lov_lock); @@ -683,6 +682,14 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, lov->lov_tgts[index] = tgt; if (index >= lov->desc.ld_tgt_count) lov->desc.ld_tgt_count = index + 1; + + rc = lov_ost_pool_add(&lov->lov_packed, index, lov->lov_tgt_size); + if (rc) { + /* lov_lock is still held here; release it before returning */ + mutex_up(&lov->lov_lock); + RETURN(rc); + } + mutex_up(&lov->lov_lock); CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n", @@ -781,6 +785,7 @@ static void __lov_del_obd(struct obd_device *obd, __u32 index) * maximum tgt index for computing the mds_max_easize. So we can't * shrink it.
*/ + lov_ost_pool_remove(&lov->lov_packed, index); lov->lov_tgts[index] = NULL; OBD_FREE_PTR(tgt); @@ -841,6 +846,7 @@ static int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) struct lov_desc *desc; struct lov_obd *lov = &obd->u.lov; int count; + int rc; ENTRY; if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { @@ -884,16 +890,27 @@ static int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) desc->ld_active_tgt_count = 0; lov->desc = *desc; lov->lov_tgt_size = 0; + rc = lov_ost_pool_init(&lov->lov_packed, 0); + if (rc) + RETURN(rc); + sema_init(&lov->lov_lock, 1); atomic_set(&lov->lov_refcount, 0); CFS_INIT_LIST_HEAD(&lov->lov_qos.lq_oss_list); init_rwsem(&lov->lov_qos.lq_rw_sem); lov->lov_qos.lq_dirty = 1; - lov->lov_qos.lq_dirty_rr = 1; + lov->lov_qos.lq_rr.lqr_dirty = 1; lov->lov_qos.lq_reset = 1; /* Default priority is toward free space balance */ lov->lov_qos.lq_prio_free = 232; + lov->lov_pools_hash_body = lustre_hash_init("POOLS", 128, 128, + &pool_hash_operations, + 0); + + CFS_INIT_LIST_HEAD(&lov->lov_pool_list); + lov->lov_pool_count = 0; + lprocfs_lov_init_vars(&lvars); lprocfs_obd_setup(obd, lvars.obd_vars); #ifdef LPROCFS @@ -906,6 +923,9 @@ static int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) CWARN("Error adding the target_obd file\n"); } #endif + lov->lov_pool_proc_entry = lprocfs_register("pools", + obd->obd_proc_entry, + NULL, NULL); RETURN(0); } @@ -939,8 +959,23 @@ static int lov_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) static int lov_cleanup(struct obd_device *obd) { struct lov_obd *lov = &obd->u.lov; + struct list_head *pos, *tmp; + struct pool_desc *pool; + + list_for_each_safe(pos, tmp, &lov->lov_pool_list) { + pool = list_entry(pos, struct pool_desc, pool_list); + list_del(&pool->pool_list); + lustre_hash_del_key(lov->lov_pools_hash_body, pool->pool_name); + lov_ost_pool_free(&(pool->pool_rr.lqr_pool)); + lov_ost_pool_free(&(pool->pool_obds)); + OBD_FREE(pool, sizeof(*pool)); + } + 
lustre_hash_exit(lov->lov_pools_hash_body); lprocfs_obd_cleanup(obd); + + lov_ost_pool_free(&lov->lov_packed); + if (lov->lov_tgts) { int i; for (i = 0; i < lov->desc.ld_tgt_count; i++) { @@ -964,8 +999,7 @@ static int lov_cleanup(struct obd_device *obd) lov->lov_tgt_size = 0; } - if (lov->lov_qos.lq_rr_size) - OBD_FREE(lov->lov_qos.lq_rr_array, lov->lov_qos.lq_rr_size); + lov_ost_pool_free(&(lov->lov_qos.lq_rr.lqr_pool)); RETURN(0); } @@ -1015,6 +1049,12 @@ static int lov_process_config(struct obd_device *obd, obd_count len, void *buf) lcfg, obd); GOTO(out, rc); } + case LCFG_POOL_NEW: + case LCFG_POOL_ADD: + case LCFG_POOL_DEL: + case LCFG_POOL_REM: + GOTO(out, rc); + default: { CERROR("Unknown command: %d\n", lcfg->lcfg_command); GOTO(out, rc = -EINVAL); @@ -1193,7 +1233,8 @@ static int lov_create(struct obd_export *exp, struct obdo *src_oa, #define ASSERT_LSM_MAGIC(lsmp) \ do { \ LASSERT((lsmp) != NULL); \ - LASSERTF(((lsmp)->lsm_magic == LOV_MAGIC || \ + LASSERTF(((lsmp)->lsm_magic == LOV_MAGIC_V1 || \ + (lsmp)->lsm_magic == LOV_MAGIC_V3 || \ (lsmp)->lsm_magic == LOV_MAGIC_JOIN), "%p->lsm_magic=%x\n", \ (lsmp), (lsmp)->lsm_magic); \ } while (0) @@ -3332,6 +3373,10 @@ struct obd_ops lov_obd_ops = { .o_unregister_page_removal_cb = lov_unregister_page_removal_cb, .o_register_lock_cancel_cb = lov_register_lock_cancel_cb, .o_unregister_lock_cancel_cb = lov_unregister_lock_cancel_cb, + .o_pool_new = lov_pool_new, + .o_pool_rem = lov_pool_remove, + .o_pool_add = lov_pool_add, + .o_pool_del = lov_pool_del, }; static quota_interface_t *quota_interface; diff --git a/lustre/lov/lov_pack.c b/lustre/lov/lov_pack.c index 8b2003d..b02c980 100644 --- a/lustre/lov/lov_pack.c +++ b/lustre/lov/lov_pack.c @@ -94,6 +94,52 @@ void lov_dump_lmm_join(int level, struct lov_mds_md_join *lmmj) le32_to_cpu(lmmj->lmmj_extent_count)); } +void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm) +{ + struct lov_ost_data_v1 *lod; + int i; + + CDEBUG(level, "objid "LPX64", magic 0x%08x, 
pattern %#x\n", + le64_to_cpu(lmm->lmm_object_id), le32_to_cpu(lmm->lmm_magic), + le32_to_cpu(lmm->lmm_pattern)); + CDEBUG(level,"stripe_size %u, stripe_count %u\n", + le32_to_cpu(lmm->lmm_stripe_size), + le32_to_cpu(lmm->lmm_stripe_count)); + CDEBUG(level,"pool_name "POOLNAMEF"\n", lmm->lmm_pool_name); + + if (le32_to_cpu(lmm->lmm_stripe_count) <= LOV_V1_INSANE_STRIPE_COUNT) { + for (i = 0, lod = lmm->lmm_objects; + i < (int)le32_to_cpu(lmm->lmm_stripe_count); i++, lod++) + CDEBUG(level, + "stripe %u idx %u subobj "LPX64"/"LPX64"\n", + i, le32_to_cpu(lod->l_ost_idx), + le64_to_cpu(lod->l_object_gr), + le64_to_cpu(lod->l_object_id)); + } else { + CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n", + le32_to_cpu(lmm->lmm_stripe_count), + LOV_V1_INSANE_STRIPE_COUNT); + } +} + +void lov_dump_lmm(int level, void *lmm) +{ + int magic; + + magic = ((struct lov_mds_md_v1 *)(lmm))->lmm_magic; + switch (magic) { + case LOV_MAGIC_V1: + return lov_dump_lmm_v1(level, (struct lov_mds_md_v1 *)(lmm)); + case LOV_MAGIC_JOIN: + return lov_dump_lmm_join(level, (struct lov_mds_md_join *)(lmm)); + case LOV_MAGIC_V3: + return lov_dump_lmm_v3(level, (struct lov_mds_md_v3 *)(lmm)); + default: + CERROR("Cannot recognize lmm_magic %x", magic); + } + return; +} + #define LMM_ASSERT(test) \ do { \ if (!(test)) lov_dump_lmm(D_ERROR, lmm); \ @@ -113,37 +159,51 @@ int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, { struct obd_device *obd = class_exp2obd(exp); struct lov_obd *lov = &obd->u.lov; - struct lov_mds_md *lmm; + struct lov_mds_md_v1 *lmmv1; + struct lov_mds_md_v3 *lmmv3; int stripe_count = lov->desc.ld_tgt_count; - int lmm_size; + struct lov_ost_data_v1 *lmm_objects; + int lmm_size, lmm_magic; int i; ENTRY; if (lsm) { - if (lsm->lsm_magic != LOV_MAGIC) { - CERROR("bad mem LOV MAGIC: 0x%08X != 0x%08X\n", - lsm->lsm_magic, LOV_MAGIC); - RETURN(-EINVAL); - } + lmm_magic = lsm->lsm_magic; + /* If we are just sizing the EA, limit the stripe count * to the actual 
number of OSTs in this filesystem. */ if (!lmmp) { - stripe_count = lov_get_stripecnt(lov, lsm->lsm_stripe_count); + stripe_count = lov_get_stripecnt(lov, + lsm->lsm_stripe_count); lsm->lsm_stripe_count = stripe_count; } else { stripe_count = lsm->lsm_stripe_count; } + } else if (lmmp && *lmmp) { + lmm_magic = le32_to_cpu((*lmmp)->lmm_magic); + } else { + /* lsm == NULL and lmmp == NULL */ + lmm_magic = LOV_MAGIC; + } + + if ((lmm_magic != LOV_MAGIC_V1) && + (lmm_magic != LOV_MAGIC_V3)) { + CERROR("bad mem LOV MAGIC: 0x%08X != 0x%08X nor 0x%08X\n", + lmm_magic, LOV_MAGIC_V1, LOV_MAGIC_V3); + RETURN(-EINVAL); + } /* XXX LOV STACKING call into osc for sizes */ - lmm_size = lov_mds_md_size(stripe_count); + lmm_size = lov_mds_md_size(stripe_count, lmm_magic); if (!lmmp) RETURN(lmm_size); if (*lmmp && !lsm) { stripe_count = le32_to_cpu((*lmmp)->lmm_stripe_count); - OBD_FREE(*lmmp, lov_mds_md_size(stripe_count)); + lmm_size = lov_mds_md_size(stripe_count, lmm_magic); + OBD_FREE(*lmmp, lmm_size); *lmmp = NULL; RETURN(0); } @@ -154,28 +214,44 @@ int lov_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, RETURN(-ENOMEM); } - lmm = *lmmp; - lmm->lmm_magic = cpu_to_le32(LOV_MAGIC); /* only write new format */ + CDEBUG(D_INFO, "lov_packmd: LOV_MAGIC 0x%08X, lmm_size = %d \n", + lmm_magic, lmm_size); + + lmmv1 = *lmmp; + lmmv3 = (struct lov_mds_md_v3 *)*lmmp; + if (lmm_magic == LOV_MAGIC_V3) + lmmv3->lmm_magic = cpu_to_le32(LOV_MAGIC_V3); + else + lmmv1->lmm_magic = cpu_to_le32(LOV_MAGIC_V1); if (!lsm) RETURN(lmm_size); - lmm->lmm_object_id = cpu_to_le64(lsm->lsm_object_id); - lmm->lmm_object_gr = cpu_to_le64(lsm->lsm_object_gr); - lmm->lmm_stripe_size = cpu_to_le32(lsm->lsm_stripe_size); - lmm->lmm_stripe_count = cpu_to_le32(stripe_count); - lmm->lmm_pattern = cpu_to_le32(lsm->lsm_pattern); + /* lmmv1 and lmmv3 point to the same struct and have the + * same first fields + */ + lmmv1->lmm_object_id = cpu_to_le64(lsm->lsm_object_id); + lmmv1->lmm_object_gr = 
cpu_to_le64(lsm->lsm_object_gr); + lmmv1->lmm_stripe_size = cpu_to_le32(lsm->lsm_stripe_size); + lmmv1->lmm_stripe_count = cpu_to_le32(stripe_count); + lmmv1->lmm_pattern = cpu_to_le32(lsm->lsm_pattern); + if (lsm->lsm_magic == LOV_MAGIC_V3) { + strncpy(lmmv3->lmm_pool_name, lsm->lsm_pool_name, MAXPOOLNAME); + lmm_objects = lmmv3->lmm_objects; + } else { + lmm_objects = lmmv1->lmm_objects; + } for (i = 0; i < stripe_count; i++) { struct lov_oinfo *loi = lsm->lsm_oinfo[i]; /* XXX LOV STACKING call down to osc_packmd() to do packing */ LASSERTF(loi->loi_id, "lmm_oid "LPU64" stripe %u/%u idx %u\n", - lmm->lmm_object_id, i, stripe_count, loi->loi_ost_idx); - lmm->lmm_objects[i].l_object_id = cpu_to_le64(loi->loi_id); - lmm->lmm_objects[i].l_object_gr = cpu_to_le64(loi->loi_gr); - lmm->lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen); - lmm->lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx); + lmmv1->lmm_object_id, i, stripe_count, loi->loi_ost_idx); + lmm_objects[i].l_object_id = cpu_to_le64(loi->loi_id); + lmm_objects[i].l_object_gr = cpu_to_le64(loi->loi_gr); + lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen); + lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx); } RETURN(lmm_size); @@ -205,9 +281,22 @@ static int lov_verify_lmm(void *lmm, int lmm_bytes, int *stripe_count) int rc; if (lsm_op_find(le32_to_cpu(*(__u32 *)lmm)) == NULL) { - CERROR("bad disk LOV MAGIC: 0x%08X; dumping V1 LMM:\n", - le32_to_cpu(*(__u32 *)lmm)); - lov_dump_lmm_v1(D_WARNING, lmm); + char *buffer; + int sz; + + CERROR("bad disk LOV MAGIC: 0x%08X; dumping LMM (size=%d):\n", + le32_to_cpu(*(__u32 *)lmm), lmm_bytes); + sz = lmm_bytes * 2 + 1; + OBD_ALLOC(buffer, sz); + if (buffer != NULL) { + int i; + + for (i = 0; i < lmm_bytes; i++) + sprintf(buffer+2*i, "%.2X", ((unsigned char *)lmm)[i]); + buffer[sz - 1] = '\0'; /* last byte of the sz-byte buffer */ + CERROR("%s\n", buffer); + OBD_FREE(buffer, sz); + } return -EINVAL; } rc = lsm_op_find(le32_to_cpu(*(__u32 *)lmm))->lsm_lmm_verify(lmm, @@ -234,6 +323,7 @@ int
lov_alloc_memmd(struct lov_stripe_md **lsmp, int stripe_count, (*lsmp)->lsm_stripe_count = stripe_count; (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count; (*lsmp)->lsm_pattern = pattern; + (*lsmp)->lsm_pool_name[0] = '\0'; (*lsmp)->lsm_oinfo[0]->loi_ost_idx = ~0; for (i = 0; i < stripe_count; i++) @@ -312,68 +402,108 @@ static int __lov_setstripe(struct obd_export *exp, struct lov_stripe_md **lsmp, { struct obd_device *obd = class_exp2obd(exp); struct lov_obd *lov = &obd->u.lov; - struct lov_user_md lum; + struct lov_user_md_v3 lumv3; + struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; + int lmm_magic; int stripe_count; int rc; ENTRY; - rc = copy_from_user(&lum, lump, sizeof(lum)); + rc = copy_from_user(&lumv3, lump, sizeof(struct lov_user_md_v1)); if (rc) RETURN(-EFAULT); - if (lum.lmm_magic != LOV_USER_MAGIC) { - if (lum.lmm_magic == __swab32(LOV_USER_MAGIC)) { - lustre_swab_lov_user_md(&lum); - } else { - CDEBUG(D_IOCTL, "bad userland LOV MAGIC:" - " %#08x != %#08x\n", - lum.lmm_magic, LOV_USER_MAGIC); - RETURN(-EINVAL); - } + lmm_magic = lumv1->lmm_magic; + + if (lmm_magic == __swab32(LOV_USER_MAGIC_V1)) { + lustre_swab_lov_user_md_v1(lumv1); + lmm_magic = LOV_USER_MAGIC_V1; + } else if (lmm_magic == LOV_USER_MAGIC_V3) { + rc = copy_from_user(&lumv3, lump, sizeof(lumv3)); + if (rc) + RETURN(-EFAULT); + } else if (lmm_magic == __swab32(LOV_USER_MAGIC_V3)) { + rc = copy_from_user(&lumv3, lump, sizeof(lumv3)); + if (rc) + RETURN(-EFAULT); + lustre_swab_lov_user_md_v3(&lumv3); + lmm_magic = LOV_USER_MAGIC_V3; + } else if (lmm_magic != LOV_USER_MAGIC_V1) { + CDEBUG(D_IOCTL, + "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n", + lmm_magic, LOV_USER_MAGIC_V1, LOV_USER_MAGIC_V3); + RETURN(-EINVAL); } - if (lum.lmm_pattern == 0) { - lum.lmm_pattern = lov->desc.ld_pattern ? 
+ /* in the rest of the tests, as *lumv1 and lumv3 have the same + * fields, we use lumv1 to avoid code duplication */ + + if (lumv1->lmm_pattern == 0) { + lumv1->lmm_pattern = lov->desc.ld_pattern ? lov->desc.ld_pattern : LOV_PATTERN_RAID0; } - if (lum.lmm_pattern != LOV_PATTERN_RAID0) { + if (lumv1->lmm_pattern != LOV_PATTERN_RAID0) { CDEBUG(D_IOCTL, "bad userland stripe pattern: %#x\n", - lum.lmm_pattern); + lumv1->lmm_pattern); RETURN(-EINVAL); } /* 64kB is the largest common page size we see (ia64), and matches the * check in lfs */ - if (lum.lmm_stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) { + if (lumv1->lmm_stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) { CDEBUG(D_IOCTL, "stripe size %u not multiple of %u, fixing\n", - lum.lmm_stripe_size, LOV_MIN_STRIPE_SIZE); - lum.lmm_stripe_size = LOV_MIN_STRIPE_SIZE; + lumv1->lmm_stripe_size, LOV_MIN_STRIPE_SIZE); + lumv1->lmm_stripe_size = LOV_MIN_STRIPE_SIZE; } - if ((lum.lmm_stripe_offset >= lov->desc.ld_tgt_count) && - (lum.lmm_stripe_offset != (typeof(lum.lmm_stripe_offset))(-1))) { + if ((lumv1->lmm_stripe_offset >= lov->desc.ld_tgt_count) && + (lumv1->lmm_stripe_offset != + (typeof(lumv1->lmm_stripe_offset))(-1))) { CDEBUG(D_IOCTL, "stripe offset %u > number of OSTs %u\n", - lum.lmm_stripe_offset, lov->desc.ld_tgt_count); + lumv1->lmm_stripe_offset, lov->desc.ld_tgt_count); RETURN(-EINVAL); } - stripe_count = lov_get_stripecnt(lov, lum.lmm_stripe_count); + stripe_count = lov_get_stripecnt(lov, lumv1->lmm_stripe_count); + + if (lmm_magic == LOV_USER_MAGIC_V3) { + struct pool_desc *pool; + + pool = lov_find_pool(lov, lumv3.lmm_pool_name); + if (pool == NULL) + RETURN(-EINVAL); + + if (lumv3.lmm_stripe_offset != + (typeof(lumv3.lmm_stripe_offset))(-1)) { + rc = lov_check_index_in_pool(lumv3.lmm_stripe_offset, + pool); + if (rc < 0) + RETURN(-EINVAL); + } + + if (stripe_count > pool_tgt_count(pool)) + stripe_count = pool_tgt_count(pool); + } - if ((__u64)lum.lmm_stripe_size * stripe_count > ~0UL) { + if 
((__u64)lumv1->lmm_stripe_size * stripe_count > ~0UL) { CDEBUG(D_IOCTL, "stripe width %ux%i exeeds %lu bytes\n", - lum.lmm_stripe_size, (int)lum.lmm_stripe_count, ~0UL); + lumv1->lmm_stripe_size, (int)lumv1->lmm_stripe_count, + ~0UL); RETURN(-EINVAL); } - rc = lov_alloc_memmd(lsmp, stripe_count, lum.lmm_pattern, LOV_MAGIC); + rc = lov_alloc_memmd(lsmp, stripe_count, lumv1->lmm_pattern, lmm_magic); if (rc >= 0) { - (*lsmp)->lsm_oinfo[0]->loi_ost_idx = lum.lmm_stripe_offset; - (*lsmp)->lsm_stripe_size = lum.lmm_stripe_size; + (*lsmp)->lsm_oinfo[0]->loi_ost_idx = lumv1->lmm_stripe_offset; + (*lsmp)->lsm_stripe_size = lumv1->lmm_stripe_size; + if (lmm_magic == LOV_USER_MAGIC_V3) + strncpy((*lsmp)->lsm_pool_name, lumv3.lmm_pool_name, + MAXPOOLNAME); rc = 0; } - RETURN(0); + RETURN(rc); } /* Configure object striping information on a new file. @@ -405,20 +535,27 @@ int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp, struct obd_export *oexp; struct lov_obd *lov = &exp->exp_obd->u.lov; obd_id last_id = 0; + struct lov_user_ost_data_v1 *lmm_objects; ENTRY; + + if (lump->lmm_magic == LOV_USER_MAGIC_V3) + lmm_objects = ((struct lov_user_md_v3 *)lump)->lmm_objects; + else + lmm_objects = lump->lmm_objects; + for (i = 0; i < lump->lmm_stripe_count; i++) { __u32 len = sizeof(last_id); - oexp = lov->lov_tgts[lump->lmm_objects[i].l_ost_idx]->ltd_exp; + oexp = lov->lov_tgts[lmm_objects[i].l_ost_idx]->ltd_exp; rc = obd_get_info(oexp, sizeof(KEY_LAST_ID), KEY_LAST_ID, &len, &last_id, NULL); if (rc) RETURN(rc); - if (lump->lmm_objects[i].l_object_id > last_id) { + if (lmm_objects[i].l_object_id > last_id) { CERROR("Setting EA for object > than last id on " "ost idx %d "LPD64" > "LPD64" \n", - lump->lmm_objects[i].l_ost_idx, - lump->lmm_objects[i].l_object_id, last_id); + lmm_objects[i].l_ost_idx, + lmm_objects[i].l_object_id, last_id); RETURN(-EINVAL); } } @@ -429,9 +566,9 @@ int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp, for (i = 0; i < 
lump->lmm_stripe_count; i++) { (*lsmp)->lsm_oinfo[i]->loi_ost_idx = - lump->lmm_objects[i].l_ost_idx; - (*lsmp)->lsm_oinfo[i]->loi_id = lump->lmm_objects[i].l_object_id; - (*lsmp)->lsm_oinfo[i]->loi_gr = lump->lmm_objects[i].l_object_gr; + lmm_objects[i].l_ost_idx; + (*lsmp)->lsm_oinfo[i]->loi_id = lmm_objects[i].l_object_id; + (*lsmp)->lsm_oinfo[i]->loi_gr = lmm_objects[i].l_object_gr; } RETURN(0); } @@ -449,9 +586,11 @@ int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm, /* * XXX huge struct allocated on stack. */ - struct lov_user_md lum; + /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ + struct lov_user_md_v3 lum; struct lov_mds_md *lmmk = NULL; int rc, lmm_size; + int lum_size; mm_segment_t seg; ENTRY; @@ -464,12 +603,22 @@ int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm, */ seg = get_fs(); set_fs(KERNEL_DS); - rc = copy_from_user(&lum, lump, sizeof(lum)); + + /* we only need the header part from user space to get lmm_magic and + * lmm_stripe_count, (the header part is common to v1 and v3) */ + lum_size = sizeof(struct lov_user_md_v1); + rc = copy_from_user(&lum, lump, lum_size); + if (rc) rc = -EFAULT; - else if (lum.lmm_magic != LOV_USER_MAGIC) + else if ((lum.lmm_magic != LOV_USER_MAGIC) && + (lum.lmm_magic != LOV_USER_MAGIC_V3)) rc = -EINVAL; else { + /* if v3 we just have to update the lum_size */ + if (lum.lmm_magic == LOV_USER_MAGIC_V3) + lum_size = sizeof(struct lov_user_md_v3); + rc = lov_packmd(exp, &lmmk, lsm); if (rc < 0) RETURN(rc); @@ -477,17 +626,18 @@ int lov_getstripe(struct obd_export *exp, struct lov_stripe_md *lsm, rc = 0; /* FIXME: Bug 1185 - copy fields properly when structs change */ - CLASSERT(sizeof lum == sizeof *lmmk); + /* struct lov_user_md_v3 and struct lov_mds_md_v3 must be the same */ + CLASSERT(sizeof(lum) == sizeof(struct lov_mds_md_v3)); CLASSERT(sizeof lum.lmm_objects[0] == sizeof lmmk->lmm_objects[0]); /* User wasn't expecting this many OST entries */ if 
(lum.lmm_stripe_count == 0) { - if (copy_to_user(lump, lmmk, sizeof lum)) + if (copy_to_user(lump, lmmk, lum_size)) rc = -EFAULT; } else if (lum.lmm_stripe_count < lmmk->lmm_stripe_count) { rc = -EOVERFLOW; - } else if (copy_to_user(lump, lmmk, sizeof lum)) + } else if (copy_to_user(lump, lmmk, lmm_size)) rc = -EFAULT; obd_free_diskmd(exp, &lmmk); diff --git a/lustre/lov/lov_pool.c b/lustre/lov/lov_pool.c new file mode 100644 index 0000000..05fde47 --- /dev/null +++ b/lustre/lov/lov_pool.c @@ -0,0 +1,619 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see [sun.com URL with a + * copy of GPLv2]. + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/lov/lov_pool.c + * + * OST pool methods + * + * Author: Jacques-Charles LAFOUCRIERE + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +#include +#include "lov_internal.h" + +/* + * hash function using a Rotating Hash algorithm + * Knuth, D. The Art of Computer Programming, + * Volume 3: Sorting and Searching, + * Chapter 6.4. + * Addison Wesley, 1973 + */ +static __u32 pool_hashfn(lustre_hash_t *hash_body, void *key, unsigned mask) +{ + int i; + __u32 result; + char *poolname; + + result = 0; + poolname = (char *)key; + for (i = 0; i < MAXPOOLNAME; i++) { + if (poolname[i] == '\0') + break; + result = (result << 4)^(result >> 28) ^ poolname[i]; + } + return (result % mask); +} + +static void *pool_key(struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + return (pool->pool_name); +} + +static int pool_hashkey_compare(void *key, struct hlist_node *compared_hnode) +{ + char *pool_name; + struct pool_desc *pool; + int rc; + + pool_name = (char *)key; + pool = hlist_entry(compared_hnode, struct pool_desc, pool_hash); + rc = strncmp(pool_name, pool->pool_name, MAXPOOLNAME); + return (!rc); +} + +static void *pool_hashrefcount_get(struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + return (pool); +} + +static void *pool_hashrefcount_put(struct hlist_node *hnode) +{ + struct pool_desc *pool; + + pool = hlist_entry(hnode, struct pool_desc, pool_hash); + return (pool); +} + +lustre_hash_ops_t pool_hash_operations = { + .lh_hash = pool_hashfn, + .lh_key = pool_key, + .lh_compare = pool_hashkey_compare, + .lh_get = pool_hashrefcount_get, + .lh_put = pool_hashrefcount_put, +}; + +#ifdef LPROCFS +/* ifdef needed for liblustre support */ +/* + * pool /proc seq_file methods + */ +/* + * iterator is used to go through the target pool entries + * index is the current entry index in the lp_array[] 
array + * index >= pos returned to the seq_file interface + * pos is from 0 to (pool->pool_obds.op_count - 1) + */ +#define POOL_IT_MAGIC 0xB001CEA0 +struct pool_iterator { + int magic; + struct pool_desc *pool; + int idx; /* from 0 to pool_tgt_size - 1 */ +}; + +static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct pool_iterator *iter = (struct pool_iterator *)s->private; + int prev_idx; + + LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic); + + /* test if end of file */ + if (*pos >= pool_tgt_count(iter->pool)) + return NULL; + + /* iterate to find a non empty entry */ + prev_idx = iter->idx; + read_lock(&pool_tgt_rwlock(iter->pool)); + iter->idx++; + if (iter->idx == pool_tgt_count(iter->pool)) { + iter->idx = prev_idx; /* we stay on the last entry */ + read_unlock(&pool_tgt_rwlock(iter->pool)); + return NULL; + } + read_unlock(&pool_tgt_rwlock(iter->pool)); + (*pos)++; + /* return != NULL to continue */ + return iter; +} + +static void *pool_proc_start(struct seq_file *s, loff_t *pos) +{ + struct pool_desc *pool = (struct pool_desc *)s->private; + struct pool_iterator *iter; + + if ((pool_tgt_count(pool) == 0) || + (*pos >= pool_tgt_count(pool))) + return NULL; + + OBD_ALLOC(iter, sizeof(struct pool_iterator)); + if (!iter) + return ERR_PTR(-ENOMEM); + iter->magic = POOL_IT_MAGIC; + iter->pool = pool; + iter->idx = 0; + + /* we use seq_file private field to memorized iterator so + * we can free it at stop() */ + /* /!\ do not forget to restore it to pool before freeing it */ + s->private = iter; + if (*pos > 0) { + loff_t i; + void *ptr; + + i = 0; + do { + ptr = pool_proc_next(s, &iter, &i); + } while ((i < *pos) && (ptr != NULL)); + return ptr; + } + return iter; +} + +static void pool_proc_stop(struct seq_file *s, void *v) +{ + struct pool_iterator *iter = (struct pool_iterator *)s->private; + + /* in some cases stop() method is called 2 times, without + * calling start() method (see seq_read() from fs/seq_file.c) + * we 
have to free only if s->private is an iterator */ + if ((iter) && (iter->magic == POOL_IT_MAGIC)) { + /* we restore s->private so next call to pool_proc_start() + * will work */ + s->private = iter->pool; + OBD_FREE(iter, sizeof(struct pool_iterator)); + } + return; +} + +static int pool_proc_show(struct seq_file *s, void *v) +{ + struct pool_iterator *iter = (struct pool_iterator *)v; + struct lov_tgt_desc *tgt; + + LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X", iter->magic); + LASSERT(iter->pool != NULL); + LASSERT(iter->idx <= pool_tgt_count(iter->pool)); + + read_lock(&pool_tgt_rwlock(iter->pool)); + tgt = pool_tgt(iter->pool, iter->idx); + read_unlock(&pool_tgt_rwlock(iter->pool)); + if (tgt) + seq_printf(s, "%s\n", obd_uuid2str(&(tgt->ltd_uuid))); + + return 0; +} + +static struct seq_operations pool_proc_ops = { + .start = pool_proc_start, + .next = pool_proc_next, + .stop = pool_proc_stop, + .show = pool_proc_show, +}; + +static int pool_proc_open(struct inode *inode, struct file *file) +{ + int rc; + + rc = seq_open(file, &pool_proc_ops); + if (!rc) { + struct seq_file *s = file->private_data; + s->private = PROC_I(inode)->pde->data; + } + return rc; +} + +static struct file_operations pool_proc_operations = { + .open = pool_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* LPROCFS */ + +void lov_dump_pool(int level, struct pool_desc *pool) +{ + int i; + + CDEBUG(level, "pool "POOLNAMEF" has %d members\n", + pool->pool_name, pool->pool_obds.op_count); + read_lock(&pool_tgt_rwlock(pool)); + for (i = 0; i < pool_tgt_count(pool) ; i++) { + if (!pool_tgt(pool, i) || !(pool_tgt(pool, i))->ltd_exp) + continue; + CDEBUG(level, "pool "POOLNAMEF"[%d] = %s\n", pool->pool_name, + i, obd_uuid2str(&((pool_tgt(pool, i))->ltd_uuid))); + } + read_unlock(&pool_tgt_rwlock(pool)); +} + +#define LOV_POOL_INIT_COUNT 2 +int lov_ost_pool_init(struct ost_pool *op, unsigned int count) +{ + if (count == 0) + count = LOV_POOL_INIT_COUNT; 
+ op->op_array = NULL; + op->op_count = 0; + op->op_rwlock = RW_LOCK_UNLOCKED; + op->op_size = count; + OBD_ALLOC(op->op_array, op->op_size * sizeof(op->op_array[0])); + if (op->op_array == NULL) { + op->op_size = 0; + return -ENOMEM; + } + return 0; +} + +int lov_ost_pool_extend(struct ost_pool *op, unsigned int max_count) +{ + __u32 *new; + int new_size; + + LASSERT(max_count != 0); + + if (op->op_count < op->op_size) + return 0; + + new_size = min(max_count, 2 * op->op_size); + OBD_ALLOC(new, new_size * sizeof(op->op_array[0])); + if (new == NULL) + return -ENOMEM; + + /* copy old array to new one */ + memcpy(new, op->op_array, op->op_size * sizeof(op->op_array[0])); + write_lock(&op->op_rwlock); + OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0])); + op->op_array = new; + op->op_size = new_size; + write_unlock(&op->op_rwlock); + return 0; +} + +int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int max_count) +{ + int rc, i; + + rc = lov_ost_pool_extend(op, max_count); + if (rc) + return rc; + + /* search ost in pool array */ + read_lock(&op->op_rwlock); + for (i = 0; i < op->op_count; i++) { + if (op->op_array[i] == idx) { + read_unlock(&op->op_rwlock); + return -EEXIST; + } + } + /* ost not found we add it */ + op->op_array[op->op_count] = idx; + op->op_count++; + read_unlock(&op->op_rwlock); + return 0; +} + +int lov_ost_pool_remove(struct ost_pool *op, __u32 idx) +{ + int i; + + read_lock(&op->op_rwlock); + for (i = 0; i < op->op_count; i++) { + if (op->op_array[i] == idx) { + memmove(&op->op_array[i], &op->op_array[i + 1], + (op->op_count - i - 1) * sizeof(op->op_array[0])); + op->op_count--; + read_unlock(&op->op_rwlock); + return 0; + } + } + read_unlock(&op->op_rwlock); + return -EINVAL; +} + +int lov_ost_pool_free(struct ost_pool *op) +{ + if (op->op_size == 0) + return 0; + + write_lock(&op->op_rwlock); + OBD_FREE(op->op_array, op->op_size * sizeof(op->op_array[0])); + op->op_array = NULL; + op->op_count = 0; + op->op_size = 0; + 
write_unlock(&op->op_rwlock); + return 0; +} + + +int lov_pool_new(struct obd_device *obd, char *poolname) +{ + struct lov_obd *lov; + struct pool_desc *new_pool; + int rc; + + lov = &(obd->u.lov); + + OBD_ALLOC(new_pool, sizeof(*new_pool)); + + if (new_pool == NULL) + return -ENOMEM; + + if (strlen(poolname) > MAXPOOLNAME) + return -ENAMETOOLONG; + + strncpy(new_pool->pool_name, poolname, MAXPOOLNAME); + new_pool->pool_name[MAXPOOLNAME] = '\0'; + new_pool->pool_lov = lov; + rc = lov_ost_pool_init(&new_pool->pool_obds, 0); + if (rc) + return rc; + + memset(&(new_pool->pool_rr), 0, sizeof(struct lov_qos_rr)); + rc = lov_ost_pool_init(&new_pool->pool_rr.lqr_pool, 0); + if (rc) + return rc; + + spin_lock(&obd->obd_dev_lock); + /* check if pool alreaddy exists */ + if (lustre_hash_lookup(lov->lov_pools_hash_body, + poolname) != NULL) { + spin_unlock(&obd->obd_dev_lock); + lov_ost_pool_free(&new_pool->pool_obds); + OBD_FREE(new_pool, sizeof(*new_pool)); + return -EEXIST; + } + + INIT_HLIST_NODE(&new_pool->pool_hash); + lustre_hash_add_unique(lov->lov_pools_hash_body, poolname, + &new_pool->pool_hash); + list_add_tail(&new_pool->pool_list, &lov->lov_pool_list); + lov->lov_pool_count++; + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_CONFIG, POOLNAMEF" is pool #%d\n", + poolname, lov->lov_pool_count); + +#ifdef LPROCFS + /* ifdef needed for liblustre */ + new_pool->pool_proc_entry = lprocfs_add_simple(lov->lov_pool_proc_entry, + poolname, + NULL, NULL, + new_pool, + &pool_proc_operations); +#endif + + if (IS_ERR(new_pool->pool_proc_entry)) { + CWARN("Cannot add proc pool entry "POOLNAMEF"\n", poolname); + new_pool->pool_proc_entry = NULL; + } + + return 0; +} + +int lov_pool_del(struct obd_device *obd, char *poolname) +{ + struct lov_obd *lov; + struct pool_desc *pool; + + lov = &(obd->u.lov); + + spin_lock(&obd->obd_dev_lock); + pool = lustre_hash_lookup(lov->lov_pools_hash_body, + poolname); + if (pool == NULL) { + spin_unlock(&obd->obd_dev_lock); + return -ENOENT; + } 
+ +#ifdef LPROCFS + if (pool->pool_proc_entry != NULL) + remove_proc_entry(pool->pool_proc_entry->name, + pool->pool_proc_entry->parent); +#endif + + /* pool is kept in the list to be freed by lov_cleanup() + * list_del(&pool->pool_list); + */ + lustre_hash_del_key(lov->lov_pools_hash_body, poolname); + + lov->lov_pool_count--; + + spin_unlock(&obd->obd_dev_lock); + + /* pool struct is not freed because it may be used by + * some open in /proc + * the struct is freed at lov_cleanup() + */ + /* + if (pool->pool_rr.lqr_size != 0) + OBD_FREE(pool->pool_rr.lqr_array, pool->pool_rr.lqr_size); + lov_ost_pool_free(&pool->pool_obds); + OBD_FREE(pool, sizeof(*pool)); + */ + return 0; +} + + +int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) +{ + struct obd_uuid ost_uuid; + struct lov_obd *lov; + struct pool_desc *pool; + unsigned int i, lov_idx; + int rc; + + lov = &(obd->u.lov); + + pool = lustre_hash_lookup(lov->lov_pools_hash_body, poolname); + if (pool == NULL) { + return -ENOENT; + } + + /* allocate pool tgt array if needed */ + mutex_down(&lov->lov_lock); + rc = lov_ost_pool_extend(&pool->pool_obds, lov->lov_tgt_size); + if (rc) { + mutex_up(&lov->lov_lock); + return rc; + } + mutex_up(&lov->lov_lock); + + obd_str2uuid(&ost_uuid, ostname); + + spin_lock(&obd->obd_dev_lock); + + /* search ost in lov array */ + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i]) + continue; + + if (obd_uuid_equals(&ost_uuid, &(lov->lov_tgts[i]->ltd_uuid))) + break; + } + + /* test if ost found in lov */ + if (i == lov->desc.ld_tgt_count) { + spin_unlock(&obd->obd_dev_lock); + return -EINVAL; + } + + spin_unlock(&obd->obd_dev_lock); + + lov_idx = i; + + rc = lov_ost_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size); + if (rc) + return rc; + + pool->pool_rr.lqr_dirty = 1; + + CDEBUG(D_CONFIG, "Added %s to "POOLNAMEF" as member %d\n", + ostname, poolname, pool_tgt_count(pool)); + return 0; +} + +int lov_pool_remove(struct obd_device *obd, char 
*poolname, char *ostname) +{ + struct obd_uuid ost_uuid; + struct lov_obd *lov; + struct pool_desc *pool; + unsigned int i, lov_idx; + + lov = &(obd->u.lov); + + spin_lock(&obd->obd_dev_lock); + pool = lustre_hash_lookup(lov->lov_pools_hash_body, poolname); + if (pool == NULL) { + spin_unlock(&obd->obd_dev_lock); + return -ENOENT; + } + + obd_str2uuid(&ost_uuid, ostname); + + /* search ost in lov array, to get index */ + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + if (!lov->lov_tgts[i]) + continue; + + if (obd_uuid_equals(&ost_uuid, &(lov->lov_tgts[i]->ltd_uuid))) + break; + } + + /* test if ost found in lov */ + if (i == lov->desc.ld_tgt_count) { + spin_unlock(&obd->obd_dev_lock); + return -EINVAL; + } + + spin_unlock(&obd->obd_dev_lock); + + lov_idx = i; + + lov_ost_pool_remove(&pool->pool_obds, lov_idx); + + pool->pool_rr.lqr_dirty = 1; + + CDEBUG(D_CONFIG, "%s removed from "POOLNAMEF"\n", ostname, poolname); + + return 0; +} + +int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool) +{ + int i; + + read_lock(&pool_tgt_rwlock(pool)); + for (i = 0; i < pool_tgt_count(pool); i++) { + if (pool_tgt_array(pool)[i] == idx) { + read_unlock(&pool_tgt_rwlock(pool)); + return 0; + } + } + read_unlock(&pool_tgt_rwlock(pool)); + return -ENOENT; +} + +struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname) +{ + struct pool_desc *pool; + + pool = NULL; + if (poolname[0] != '\0') { + pool = lustre_hash_lookup(lov->lov_pools_hash_body, poolname); + if (pool == NULL) + CWARN("Request for an unknown pool ("POOLNAMEF")\n", + poolname); + if ((pool != NULL) && (pool_tgt_count(pool) == 0)) { + CWARN("Request for an empty pool ("POOLNAMEF")\n", + poolname); + pool = NULL; + } + } + return pool; +} + diff --git a/lustre/lov/lov_qos.c b/lustre/lov/lov_qos.c index 053ef6c..4a97573 100644 --- a/lustre/lov/lov_qos.c +++ b/lustre/lov/lov_qos.c @@ -108,7 +108,7 @@ int qos_add_tgt(struct obd_device *obd, __u32 index) list_add_tail(&oss->lqo_oss_list, 
&temposs->lqo_oss_list); lov->lov_qos.lq_dirty = 1; - lov->lov_qos.lq_dirty_rr = 1; + lov->lov_qos.lq_rr.lqr_dirty = 1; CDEBUG(D_QOS, "add tgt %s to OSS %s (%d OSTs)\n", obd_uuid2str(&lov->lov_tgts[index]->ltd_uuid), @@ -146,7 +146,7 @@ int qos_del_tgt(struct obd_device *obd, __u32 index) } lov->lov_qos.lq_dirty = 1; - lov->lov_qos.lq_dirty_rr = 1; + lov->lov_qos.lq_rr.lqr_dirty = 1; out: up_write(&lov->lov_qos.lq_rw_sem); RETURN(rc); @@ -268,10 +268,11 @@ static int qos_calc_weight(struct lov_obd *lov, int i) } /* We just used this index for a stripe; adjust everyone's weights */ -static int qos_used(struct lov_obd *lov, __u32 index, __u64 *total_wt) +static int qos_used(struct lov_obd *lov, struct ost_pool *osts, + __u32 index, __u64 *total_wt) { struct lov_qos_oss *oss; - int i; + int j; ENTRY; /* Don't allocate from this stripe anymore, until the next alloc_qos */ @@ -301,7 +302,10 @@ static int qos_used(struct lov_obd *lov, __u32 index, __u64 *total_wt) *total_wt = 0; /* Decrease all OST penalties */ - for (i = 0; i < lov->desc.ld_tgt_count; i++) { + for (j = 0; j < osts->op_count; j++) { + int i; + + i = osts->op_array[j]; if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active) continue; if (lov->lov_tgts[i]->ltd_qos.ltq_penalty < @@ -318,10 +322,11 @@ static int qos_used(struct lov_obd *lov, __u32 index, __u64 *total_wt) *total_wt += lov->lov_tgts[i]->ltd_qos.ltq_weight; #ifdef QOS_DEBUG - CDEBUG(D_QOS, "recalc tgt %d avail="LPU64 + CDEBUG(D_QOS, "recalc tgt %d usable=%d avail="LPU64 " ostppo="LPU64" ostp="LPU64" ossppo="LPU64 " ossp="LPU64" wt="LPU64"\n", - i, TGT_BAVAIL(i) >> 10, + i, lov->lov_tgts[i]->ltd_qos.ltq_usable, + TGT_BAVAIL(i) >> 10, lov->lov_tgts[i]->ltd_qos.ltq_penalty_per_obj >> 10, lov->lov_tgts[i]->ltd_qos.ltq_penalty >> 10, lov->lov_tgts[i]->ltd_qos.ltq_oss->lqo_penalty_per_obj>>10, @@ -335,15 +340,16 @@ static int qos_used(struct lov_obd *lov, __u32 index, __u64 *total_wt) #define LOV_QOS_EMPTY ((__u32)-1) /* compute optimal round-robin 
order, based on OSTs per OSS */ -static int qos_calc_rr(struct lov_obd *lov) +static int qos_calc_rr(struct lov_obd *lov, struct ost_pool *src_pool, + struct lov_qos_rr *lqr) { struct lov_qos_oss *oss; - unsigned ost_count, placed, real_count; - int i; + unsigned placed, real_count; + int i, rc; ENTRY; - if (!lov->lov_qos.lq_dirty_rr) { - LASSERT(lov->lov_qos.lq_rr_size); + if (!lqr->lqr_dirty) { + LASSERT(lqr->lqr_pool.op_size); RETURN(0); } @@ -354,54 +360,45 @@ static int qos_calc_rr(struct lov_obd *lov) * Check again. While we were sleeping on @lq_rw_sem something could * change. */ - if (!lov->lov_qos.lq_dirty_rr) { - LASSERT(lov->lov_qos.lq_rr_size); + if (!lqr->lqr_dirty) { + LASSERT(lqr->lqr_pool.op_size); up_write(&lov->lov_qos.lq_rw_sem); RETURN(0); } - ost_count = lov->desc.ld_tgt_count; - - if (lov->lov_qos.lq_rr_size) - OBD_FREE(lov->lov_qos.lq_rr_array, lov->lov_qos.lq_rr_size); - lov->lov_qos.lq_rr_size = ost_count * - sizeof(lov->lov_qos.lq_rr_array[0]); - OBD_ALLOC(lov->lov_qos.lq_rr_array, lov->lov_qos.lq_rr_size); - if (!lov->lov_qos.lq_rr_array) { - lov->lov_qos.lq_rr_size = 0; + if (lqr->lqr_pool.op_size) + lov_ost_pool_free(&lqr->lqr_pool); + rc = lov_ost_pool_init(&lqr->lqr_pool, src_pool->op_count); + if (rc) { up_write(&lov->lov_qos.lq_rw_sem); - RETURN(-ENOMEM); + RETURN(rc); } - real_count = 0; - for (i = 0; i < ost_count; i++) { - lov->lov_qos.lq_rr_array[i] = LOV_QOS_EMPTY; - if (lov->lov_tgts[i]) - real_count++; - } + for (i = 0; i < src_pool->op_count; i++) + lqr->lqr_pool.op_array[i] = LOV_QOS_EMPTY; + lqr->lqr_pool.op_count = src_pool->op_count; /* Place all the OSTs from 1 OSS at the same time. 
*/ + real_count = lqr->lqr_pool.op_count; placed = 0; list_for_each_entry(oss, &lov->lov_qos.lq_oss_list, lqo_oss_list) { int j = 0; - for (i = 0; i < ost_count; i++) { - if (lov->lov_tgts[i] && - lov->lov_tgts[i]->ltd_qos.ltq_oss == oss) { + for (i = 0; i < lqr->lqr_pool.op_count; i++) { + if (lov->lov_tgts[src_pool->op_array[i]] && + (lov->lov_tgts[src_pool->op_array[i]]->ltd_qos.ltq_oss == oss)) { /* Evenly space these OSTs across arrayspace */ - int next = j * ost_count / oss->lqo_ost_count; - LASSERT(next < ost_count); - while (lov->lov_qos.lq_rr_array[next] != + int next = j * lqr->lqr_pool.op_count / oss->lqo_ost_count; + while (lqr->lqr_pool.op_array[next] != LOV_QOS_EMPTY) - next = (next + 1) % ost_count; - lov->lov_qos.lq_rr_array[next] = i; + next = (next + 1) % lqr->lqr_pool.op_count; + lqr->lqr_pool.op_array[next] = src_pool->op_array[i]; j++; placed++; } } - LASSERT(j == oss->lqo_ost_count); } - lov->lov_qos.lq_dirty_rr = 0; + lqr->lqr_dirty = 0; up_write(&lov->lov_qos.lq_rw_sem); if (placed != real_count) { @@ -409,18 +406,18 @@ static int qos_calc_rr(struct lov_obd *lov) LCONSOLE_ERROR_MSG(0x14e, "Failed to place all OSTs in the " "round-robin list (%d of %d).\n", placed, real_count); - for (i = 0; i < ost_count; i++) { + for (i = 0; i < lqr->lqr_pool.op_count; i++) { LCONSOLE(D_WARNING, "rr #%d ost idx=%d\n", i, - lov->lov_qos.lq_rr_array[i]); + lqr->lqr_pool.op_array[i]); } - lov->lov_qos.lq_dirty_rr = 1; + lqr->lqr_dirty = 1; RETURN(-EAGAIN); } #ifdef QOS_DEBUG - for (i = 0; i < ost_count; i++) { + for (i = 0; i < lqr->lqr_pool.op_count; i++) { LCONSOLE(D_QOS, "rr #%d ost idx=%d\n", i, - lov->lov_qos.lq_rr_array[i]); + lqr->lqr_pool.op_array[i]); } #endif @@ -519,54 +516,70 @@ static int min_stripe_count(int stripe_cnt, int flags) #define LOV_CREATE_RESEED_MIN 1000 /* Allocate objects on osts with round-robin algorithm */ static int alloc_rr(struct lov_obd *lov, int *idx_arr, int *stripe_cnt, - int flags) + char *poolname, int flags) { - unsigned 
array_idx, ost_count = lov->desc.ld_tgt_count; - unsigned ost_active_count = lov->desc.ld_active_tgt_count; + unsigned array_idx; int i, *idx_pos; __u32 ost_idx; int ost_start_idx_temp; int speed = 0; int stripe_cnt_min = min_stripe_count(*stripe_cnt, flags); + struct pool_desc *pool; + struct ost_pool *osts; + struct lov_qos_rr *lqr; ENTRY; - i = qos_calc_rr(lov); - if (i) + pool = lov_find_pool(lov, poolname); + if (pool == NULL) { + osts = &(lov->lov_packed); + lqr = &(lov->lov_qos.lq_rr); + } else { + read_lock(&pool_tgt_rwlock(pool)); + osts = &(pool->pool_obds); + lqr = &(pool->pool_rr); + } + + i = qos_calc_rr(lov, osts, lqr); + if (i) { + if (pool != NULL) + read_unlock(&pool_tgt_rwlock(pool)); RETURN(i); + } - if (--lov->lov_start_count <= 0) { - lov->lov_start_idx = ll_rand() % ost_count; - lov->lov_start_count = - (LOV_CREATE_RESEED_MIN / max(ost_active_count, 1U) + - LOV_CREATE_RESEED_MULT) * max(ost_active_count, 1U); - } else if (stripe_cnt_min >= ost_active_count || - lov->lov_start_idx > ost_count) { + if (--lqr->lqr_start_count <= 0) { + lqr->lqr_start_idx = ll_rand() % osts->op_count; + lqr->lqr_start_count = + (LOV_CREATE_RESEED_MIN / max(osts->op_count, 1U) + + LOV_CREATE_RESEED_MULT) * max(osts->op_count, 1U); + } else if (stripe_cnt_min >= osts->op_count || + lqr->lqr_start_idx > osts->op_count) { /* If we have allocated from all of the OSTs, slowly * precess the next start if the OST/stripe count isn't * already doing this for us. 
*/ - lov->lov_start_idx %= ost_count; - if (*stripe_cnt > 1 && (ost_active_count % (*stripe_cnt)) != 1) - ++lov->lov_offset_idx; + lqr->lqr_start_idx %= osts->op_count; + if (*stripe_cnt > 1 && (osts->op_count % (*stripe_cnt)) != 1) + ++lqr->lqr_offset_idx; } down_read(&lov->lov_qos.lq_rw_sem); - ost_start_idx_temp = lov->lov_start_idx; + ost_start_idx_temp = lqr->lqr_start_idx; repeat_find: - array_idx = (lov->lov_start_idx + lov->lov_offset_idx) % ost_count; + array_idx = (lqr->lqr_start_idx + lqr->lqr_offset_idx) % osts->op_count; idx_pos = idx_arr; #ifdef QOS_DEBUG - CDEBUG(D_QOS, "want %d startidx %d startcnt %d offset %d active %d " - "count %d arrayidx %d\n", - stripe_cnt, lov->lov_start_idx, lov->lov_start_count, - lov->lov_offset_idx, ost_active_count, ost_count, array_idx); + CDEBUG(D_QOS, "pool '%s' want %d startidx %d startcnt %d offset %d " + "active %d count %d arrayidx %d\n", poolname, + *stripe_cnt, lqr->lqr_start_idx, lqr->lqr_start_count, + lqr->lqr_offset_idx, osts->op_count, osts->op_count, array_idx); #endif - for (i = 0; i < ost_count; i++, array_idx=(array_idx + 1) % ost_count) { - ++lov->lov_start_idx; - ost_idx = lov->lov_qos.lq_rr_array[array_idx]; + for (i = 0; i < osts->op_count; + i++, array_idx=(array_idx + 1) % osts->op_count) { + ++lqr->lqr_start_idx; + ost_idx = lqr->lqr_pool.op_array[array_idx]; #ifdef QOS_DEBUG CDEBUG(D_QOS, "#%d strt %d act %d strp %d ary %d idx %d\n", - i, lov->lov_start_idx, + i, lqr->lqr_start_idx, ((ost_idx != LOV_QOS_EMPTY) && lov->lov_tgts[ost_idx]) ? 
lov->lov_tgts[ost_idx]->ltd_active : 0, idx_pos - idx_arr, array_idx, ost_idx); @@ -593,10 +606,13 @@ repeat_find: if ((speed < 2) && (idx_pos - idx_arr < stripe_cnt_min)) { /* Try again, allowing slower OSCs */ speed++; - lov->lov_start_idx = ost_start_idx_temp; + lqr->lqr_start_idx = ost_start_idx_temp; goto repeat_find; } + if (pool != NULL) + read_unlock(&pool_tgt_rwlock(pool)); + up_read(&lov->lov_qos.lq_rw_sem); *stripe_cnt = idx_pos - idx_arr; @@ -607,15 +623,45 @@ repeat_find: static int alloc_specific(struct lov_obd *lov, struct lov_stripe_md *lsm, int *idx_arr) { - unsigned ost_idx, ost_count = lov->desc.ld_tgt_count; + unsigned ost_idx, array_idx, ost_count; int i, *idx_pos; int speed = 0; + struct pool_desc *pool = NULL; + struct ost_pool *osts; ENTRY; + pool = lov_find_pool(lov, lsm->lsm_pool_name); + if (pool == NULL) { + osts = &(lov->lov_packed); + } else { + read_lock(&pool_tgt_rwlock(pool)); + osts = &(pool->pool_obds); + } + + ost_count = osts->op_count; + repeat_find: - ost_idx = lsm->lsm_oinfo[0]->loi_ost_idx; + /* search loi_ost_idx in ost array */ + array_idx = 0; + for (i = 0; i < ost_count; i++) { + if (osts->op_array[i] == lsm->lsm_oinfo[0]->loi_ost_idx) { + array_idx = i; + break; + } + } + if (i == ost_count) { + if (pool != NULL) + read_unlock(&pool_tgt_rwlock(pool)); + CERROR("Start index %d not found in pool '%s'\n", + lsm->lsm_oinfo[0]->loi_ost_idx, lsm->lsm_pool_name); + RETURN(-EINVAL); + } + idx_pos = idx_arr; - for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) { + for (i = 0; i < ost_count; + i++, array_idx = (array_idx + 1) % ost_count) { + ost_idx = osts->op_array[array_idx]; + if (!lov->lov_tgts[ost_idx] || !lov->lov_tgts[ost_idx]->ltd_active) { continue; @@ -634,8 +680,11 @@ repeat_find: *idx_pos = ost_idx; idx_pos++; /* We have enough stripes */ - if (idx_pos - idx_arr == lsm->lsm_stripe_count) + if (idx_pos - idx_arr == lsm->lsm_stripe_count) { + if (pool != NULL) + read_unlock(&pool_tgt_rwlock(pool)); 
RETURN(0); + } } if (speed < 2) { /* Try again, allowing slower OSCs */ @@ -652,6 +701,10 @@ repeat_find: CERROR("can't lstripe objid "LPX64": have %d want %u\n", lsm->lsm_object_id, (int)(idx_pos - idx_arr), lsm->lsm_stripe_count); + + if (pool != NULL) + read_unlock(&pool_tgt_rwlock(pool)); + RETURN(-EFBIG); } @@ -660,20 +713,32 @@ repeat_find: - network resources (shared OSS's) */ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt, - int flags) + char *poolname, int flags) { struct lov_obd *lov = &exp->exp_obd->u.lov; static time_t last_warn = 0; time_t now = cfs_time_current_sec(); __u64 total_bavail, total_weight = 0; - __u32 ost_count; int nfound, good_osts, i, warn = 0, rc = 0; int stripe_cnt_min = min_stripe_count(*stripe_cnt, flags); + struct pool_desc *pool; + struct ost_pool *osts; + struct lov_qos_rr *lqr; ENTRY; if (stripe_cnt_min < 1) GOTO(out_nolock, rc = -EINVAL); + pool = lov_find_pool(lov, poolname); + if (pool == NULL) { + osts = &(lov->lov_packed); + lqr = &(lov->lov_qos.lq_rr); + } else { + read_lock(&pool_tgt_rwlock(pool)); + osts = &(pool->pool_obds); + lqr = &(pool->pool_rr); + } + lov_getref(exp->exp_obd); /* Detect -EAGAIN early, before expensive lock is taken. 
*/ @@ -690,8 +755,6 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt, if (!lov->lov_qos.lq_dirty && lov->lov_qos.lq_same_space) GOTO(out, rc = -EAGAIN); - ost_count = lov->desc.ld_tgt_count; - if (lov->desc.ld_active_tgt_count < 2) GOTO(out, rc = -EAGAIN); @@ -705,24 +768,25 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt, if (cfs_time_sub(now, last_warn) > 60 * 30) warn = 1; /* Find all the OSTs that are valid stripe candidates */ - for (i = 0; i < ost_count; i++) { + for (i = 0; i < osts->op_count; i++) { __u64 bavail; - if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_active) + if (!lov->lov_tgts[osts->op_array[i]] || + !lov->lov_tgts[osts->op_array[i]]->ltd_active) continue; - bavail = TGT_BAVAIL(i); + bavail = TGT_BAVAIL(osts->op_array[i]); if (!bavail) { if (warn) { CDEBUG(D_QOS, "no free space on %s\n", - obd_uuid2str(&lov->lov_tgts[i]->ltd_uuid)); + obd_uuid2str(&lov->lov_tgts[osts->op_array[i]]->ltd_uuid)); last_warn = now; } continue; } - if (!TGT_FFREE(i)) { + if (!TGT_FFREE(osts->op_array[i])) { if (warn) { CDEBUG(D_QOS, "no free inodes on %s\n", - obd_uuid2str(&lov->lov_tgts[i]->ltd_uuid)); + obd_uuid2str(&lov->lov_tgts[osts->op_array[i]]->ltd_uuid)); last_warn = now; } continue; @@ -730,20 +794,24 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt, /* Fail Check before osc_precreate() is called so we can only 'fail' single OSC. 
*/ - if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && i == 0) + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && osts->op_array[i] == 0) continue; - if (obd_precreate(lov->lov_tgts[i]->ltd_exp) > 2) + if (obd_precreate(lov->lov_tgts[osts->op_array[i]]->ltd_exp) > 2) continue; - lov->lov_tgts[i]->ltd_qos.ltq_usable = 1; - qos_calc_weight(lov, i); + lov->lov_tgts[osts->op_array[i]]->ltd_qos.ltq_usable = 1; + qos_calc_weight(lov, osts->op_array[i]); total_bavail += bavail; - total_weight += lov->lov_tgts[i]->ltd_qos.ltq_weight; + total_weight += lov->lov_tgts[osts->op_array[i]]->ltd_qos.ltq_weight; good_osts++; } +#ifdef QOS_DEBUG + CDEBUG(D_QOS, "found %d good osts\n", good_osts); +#endif + if (good_osts < stripe_cnt_min) GOTO(out, rc = -EAGAIN); @@ -792,19 +860,24 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt, /* On average, this will hit larger-weighted osts more often. 0-weight osts will always get used last (only when rand=0).*/ - for (i = 0; i < ost_count; i++) { - if (!lov->lov_tgts[i] || - !lov->lov_tgts[i]->ltd_qos.ltq_usable) + for (i = 0; i < osts->op_count; i++) { + if (!lov->lov_tgts[osts->op_array[i]] || + !lov->lov_tgts[osts->op_array[i]]->ltd_qos.ltq_usable) continue; - cur_weight += lov->lov_tgts[i]->ltd_qos.ltq_weight; + cur_weight += lov->lov_tgts[osts->op_array[i]]->ltd_qos.ltq_weight; +#ifdef QOS_DEBUG + CDEBUG(D_QOS, "stripe_cnt=%d nfound=%d cur_weight="LPU64 + " rand="LPU64" total_weight="LPU64"\n", + *stripe_cnt, nfound, cur_weight, rand, total_weight); +#endif if (cur_weight >= rand) { #ifdef QOS_DEBUG CDEBUG(D_QOS, "assigned stripe=%d to idx=%d\n", - nfound, i); + nfound, osts->op_array[i]); #endif - idx_arr[nfound++] = i; - qos_used(lov, i, &total_weight); + idx_arr[nfound++] = osts->op_array[i]; + qos_used(lov, osts, osts->op_array[i], &total_weight); rc = 0; break; } @@ -818,11 +891,14 @@ static int alloc_qos(struct obd_export *exp, int *idx_arr, int *stripe_cnt, LASSERT(nfound == *stripe_cnt); out: + if 
(pool != NULL) + read_unlock(&pool_tgt_rwlock(pool)); + up_write(&lov->lov_qos.lq_rw_sem); out_nolock: if (rc == -EAGAIN) - rc = alloc_rr(lov, idx_arr, stripe_cnt, flags); + rc = alloc_rr(lov, idx_arr, stripe_cnt, poolname, flags); lov_putref(exp->exp_obd); RETURN(rc); @@ -847,7 +923,8 @@ static int alloc_idx_array(struct obd_export *exp, struct lov_stripe_md *lsm, if (newea || lsm->lsm_oinfo[0]->loi_ost_idx >= lov->desc.ld_tgt_count) - rc = alloc_qos(exp, tmp_arr, &stripe_cnt, flags); + rc = alloc_qos(exp, tmp_arr, &stripe_cnt, + lsm->lsm_pool_name, flags); else rc = alloc_specific(lov, lsm, tmp_arr); diff --git a/lustre/mdd/mdd_lov.c b/lustre/mdd/mdd_lov.c index 182a8b7..d05c902 100644 --- a/lustre/mdd/mdd_lov.c +++ b/lustre/mdd/mdd_lov.c @@ -265,12 +265,11 @@ static int mdd_lov_set_dir_md(const struct lu_env *env, LASSERT(S_ISDIR(mdd_object_type(obj))); lum = (struct lov_user_md*)buf->lb_buf; - /* if { size, offset, count } = { 0, -1, 0 } (i.e. all default + /* if { size, offset, count } = { 0, -1, 0 } and no pool (i.e. all default * values specified) then delete default striping from dir. 
*/ - if ((lum->lmm_stripe_size == 0 && lum->lmm_stripe_count == 0 && - lum->lmm_stripe_offset == (typeof(lum->lmm_stripe_offset))(-1)) || - /* lmm_stripe_size == -1 is deprecated in 1.4.6 */ - lum->lmm_stripe_size == (typeof(lum->lmm_stripe_size))(-1)){ + if (lum->lmm_stripe_size == 0 && lum->lmm_stripe_count == 0 && + lum->lmm_stripe_offset == (typeof(lum->lmm_stripe_offset))(-1) && + lum->lmm_magic != LOV_USER_MAGIC_V3) { rc = mdd_xattr_set_txn(env, obj, &LU_BUF_NULL, MDS_LOV_MD_NAME, 0, handle); if (rc == -ENODATA) @@ -324,7 +323,7 @@ int mdd_lov_set_md(const struct lu_env *env, struct mdd_object *pobj, if (lmmp == NULL && lmm_size == 0) { struct mdd_device *mdd = mdd_obj2mdd_dev(child); struct lov_mds_md *lmm = mdd_max_lmm_get(env, mdd); - int size = sizeof(*lmm); + int size = sizeof(struct lov_mds_md_v3); /* Get parent dir stripe and set */ if (pobj != NULL) @@ -362,15 +361,21 @@ static void mdd_lov_update_objids(struct obd_device *obd, struct lov_mds_md *lmm { struct mds_obd *mds = &obd->u.mds; int j; + struct lov_ost_data_v1 *lmm_objects; ENTRY; /* if we create file without objects - lmm is NULL */ if (lmm == NULL) return; + if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V3) + lmm_objects = ((struct lov_mds_md_v3 *)lmm)->lmm_objects; + else + lmm_objects = lmm->lmm_objects; + for (j = 0; j < le32_to_cpu(lmm->lmm_stripe_count); j++) { - int i = le32_to_cpu(lmm->lmm_objects[j].l_ost_idx); - obd_id id = le64_to_cpu(lmm->lmm_objects[j].l_object_id); + int i = le32_to_cpu(lmm_objects[j].l_ost_idx); + obd_id id = le64_to_cpu(lmm_objects[j].l_object_id); int page = i / OBJID_PER_PAGE(); int idx = i % OBJID_PER_PAGE(); obd_id *data = mds->mds_lov_page_array[page]; diff --git a/lustre/mdd/mdd_trans.c b/lustre/mdd/mdd_trans.c index 743762f..01ab561 100644 --- a/lustre/mdd/mdd_trans.c +++ b/lustre/mdd/mdd_trans.c @@ -121,7 +121,10 @@ int mdd_log_txn_param_build(const struct lu_env *env, struct md_object *obj, if (rc || !(ma->ma_valid & MA_LOV)) RETURN(rc); - 
LASSERT(le32_to_cpu(ma->ma_lmm->lmm_magic) == LOV_MAGIC); + LASSERTF(le32_to_cpu(ma->ma_lmm->lmm_magic) == LOV_MAGIC_V1 || + le32_to_cpu(ma->ma_lmm->lmm_magic) == LOV_MAGIC_V3, + "%08x", le32_to_cpu(ma->ma_lmm->lmm_magic)); + if ((int)le32_to_cpu(ma->ma_lmm->lmm_stripe_count) < 0) stripe = mdd2obd_dev(mdd)->u.mds.mds_lov_desc.ld_tgt_count; else diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 2c37cc5..ee256b2 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -410,7 +410,7 @@ static int mds_cmd_setup(struct obd_device *obd, struct lustre_cfg *lcfg) if (rc) GOTO(err_objects, rc); - mds->mds_max_mdsize = sizeof(struct lov_mds_md); + mds->mds_max_mdsize = sizeof(struct lov_mds_md_v3); mds->mds_max_cookiesize = sizeof(struct llog_cookie); err_pop: diff --git a/lustre/mds/mds_internal.h b/lustre/mds/mds_internal.h index 4d30fe3..9cf0e71 100644 --- a/lustre/mds/mds_internal.h +++ b/lustre/mds/mds_internal.h @@ -64,7 +64,8 @@ int mds_post_mds_lovconf(struct obd_device *obd); int mds_notify(struct obd_device *obd, struct obd_device *watched, enum obd_notify_event ev, void *data); int mds_convert_lov_ea(struct obd_device *obd, struct inode *inode, - struct lov_mds_md *lmm, int lmm_size); + struct lov_mds_md *lmm, int lmm_size, + __u64 connect_flags); int mds_init_lov_desc(struct obd_device *obd, struct obd_export *osc_exp); int mds_obd_create(struct obd_export *exp, struct obdo *oa, diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index 1f0f995..07444ac 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -367,7 +367,7 @@ static int mds_lov_update_desc(struct obd_device *obd, struct obd_export *lov) stripes = min_t(__u32, LOV_MAX_STRIPE_COUNT, mds->mds_lov_desc.ld_tgt_count); - mds->mds_max_mdsize = lov_mds_md_size(stripes); + mds->mds_max_mdsize = lov_mds_md_size(stripes, LOV_MAGIC_V3); mds->mds_max_cookiesize = stripes * sizeof(struct llog_cookie); CDEBUG(D_CONFIG, "updated max_mdsize/max_cookiesize for %d stripes: " 
"%d/%d\n", mds->mds_max_mdsize, mds->mds_max_cookiesize, diff --git a/lustre/mgs/mgs_handler.c b/lustre/mgs/mgs_handler.c index 3208572..1bd26d6 100644 --- a/lustre/mgs/mgs_handler.c +++ b/lustre/mgs/mgs_handler.c @@ -357,6 +357,21 @@ static int mgs_put_cfg_lock(struct lustre_handle *lockh) RETURN(0); } +static void mgs_revoke_lock(struct obd_device *obd, char *fsname, + struct lustre_handle *lockh) +{ + int lockrc; + + if (fsname[0]) { + lockrc = mgs_get_cfg_lock(obd, fsname, lockh); + if (lockrc != ELDLM_OK) + CERROR("lock error %d for fs %s\n", lockrc, + fsname); + else + mgs_put_cfg_lock(lockh); + } +} + /* rc=0 means ok 1 means update <0 means error */ @@ -508,7 +523,7 @@ static int mgs_set_info_rpc(struct ptlrpc_request *req) struct obd_device *obd = req->rq_export->exp_obd; struct mgs_send_param *msp, *rep_msp; struct lustre_handle lockh; - int lockrc, rc; + int rc; struct lustre_cfg_bufs bufs; struct lustre_cfg *lcfg; char fsname[MTI_NAME_MAXLEN]; @@ -528,19 +543,9 @@ static int mgs_set_info_rpc(struct ptlrpc_request *req) RETURN(rc); } - /* Revoke lock so everyone updates. Should be alright if - * someone was already reading while we were updating the logs, - * so we don't really need to hold the lock while we're - * writing. 
- */ - if (fsname[0]) { - lockrc = mgs_get_cfg_lock(obd, fsname, &lockh); - if (lockrc != ELDLM_OK) - CERROR("lock error %d for fs %s\n", lockrc, - fsname); - else - mgs_put_cfg_lock(&lockh); - } + /* request for update */ + mgs_revoke_lock(obd, fsname, &lockh); + lustre_cfg_free(lcfg); rc = req_capsule_server_pack(&req->rq_pill); @@ -709,6 +714,134 @@ static inline int mgs_destroy_export(struct obd_export *exp) RETURN(0); } +static int mgs_extract_fs_pool(char * arg, char *fsname, char *poolname) +{ + char *ptr; + + ENTRY; + for (ptr = arg; (*ptr != '\0') && (*ptr != '.'); ptr++ ) { + *fsname = *ptr; + fsname++; + } + if (*ptr == '\0') + return -EINVAL; + *fsname = '\0'; + ptr++; + strcpy(poolname, ptr); + + RETURN(0); +} + +static int mgs_iocontrol_pool(struct obd_device *obd, + struct obd_ioctl_data *data) +{ + int rc; + struct lustre_handle lockh; + struct lustre_cfg *lcfg = NULL; + struct llog_rec_hdr rec; + char *fsname = NULL; + char *poolname = NULL; + ENTRY; + + OBD_ALLOC(fsname, MTI_NAME_MAXLEN); + if (fsname == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC(poolname, MAXPOOLNAME + 1); + if (poolname == NULL) { + rc = -ENOMEM; + GOTO(out_pool, rc); + } + rec.lrh_len = llog_data_len(data->ioc_plen1); + + if (data->ioc_type == LUSTRE_CFG_TYPE) { + rec.lrh_type = OBD_CFG_REC; + } else { + CERROR("unknown cfg record type:%d \n", data->ioc_type); + rc = -EINVAL; + GOTO(out_pool, rc); + } + + if (data->ioc_plen1 > CFS_PAGE_SIZE) { + rc = -E2BIG; + GOTO(out_pool, rc); + } + + OBD_ALLOC(lcfg, data->ioc_plen1); + if (lcfg == NULL) { + rc = -ENOMEM; + GOTO(out_pool, rc); + } + rc = copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1); + if (rc) + GOTO(out_pool, rc); + + if (lcfg->lcfg_bufcount < 2) { + rc = -EINVAL; + GOTO(out_pool, rc); + } + + /* first arg is always . 
*/ + mgs_extract_fs_pool(lustre_cfg_string(lcfg, 1), fsname, + poolname); + + switch (lcfg->lcfg_command) { + case LCFG_POOL_NEW: { + if (lcfg->lcfg_bufcount != 2) + RETURN(-EINVAL); + rc = mgs_pool_cmd(obd, LCFG_POOL_NEW, fsname, + poolname, NULL); + break; + } + case LCFG_POOL_ADD: { + if (lcfg->lcfg_bufcount != 3) + RETURN(-EINVAL); + rc = mgs_pool_cmd(obd, LCFG_POOL_ADD, fsname, poolname, + lustre_cfg_string(lcfg, 2)); + break; + } + case LCFG_POOL_REM: { + if (lcfg->lcfg_bufcount != 3) + RETURN(-EINVAL); + rc = mgs_pool_cmd(obd, LCFG_POOL_REM, fsname, poolname, + lustre_cfg_string(lcfg, 2)); + break; + } + case LCFG_POOL_DEL: { + if (lcfg->lcfg_bufcount != 2) + RETURN(-EINVAL); + rc = mgs_pool_cmd(obd, LCFG_POOL_DEL, fsname, + poolname, NULL); + break; + } + default: { + rc = -EINVAL; + GOTO(out_pool, rc); + } + } + + if (rc) { + CERROR("OBD_IOC_POOL err %d, cmd %X for pool %s.%s\n", + rc, lcfg->lcfg_command, fsname, poolname); + GOTO(out_pool, rc); + } + + /* request for update */ + mgs_revoke_lock(obd, fsname, &lockh); + +out_pool: + if (lcfg != NULL) + OBD_FREE(lcfg, data->ioc_plen1); + + if (fsname != NULL) + OBD_FREE(fsname, MTI_NAME_MAXLEN); + + if (poolname != NULL) + OBD_FREE(poolname, MAXPOOLNAME + 1); + + RETURN(rc); +} + /* from mdt_iocontrol */ int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len, void *karg, void *uarg) @@ -728,7 +861,6 @@ int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len, struct lustre_cfg *lcfg; struct llog_rec_hdr rec; char fsname[MTI_NAME_MAXLEN]; - int lockrc; rec.lrh_len = llog_data_len(data->ioc_plen1); @@ -759,20 +891,17 @@ int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len, someone was already reading while we were updating the logs, so we don't really need to hold the lock while we're writing (above). 
*/ - if (fsname[0]) { - lockrc = mgs_get_cfg_lock(obd, fsname, &lockh); - if (lockrc != ELDLM_OK) - CERROR("lock error %d for fs %s\n", lockrc, - fsname); - else - mgs_put_cfg_lock(&lockh); - } + mgs_revoke_lock(obd, fsname, &lockh); out_free: OBD_FREE(lcfg, data->ioc_plen1); RETURN(rc); } + case OBD_IOC_POOL: { + RETURN(mgs_iocontrol_pool(obd, data)); + } + case OBD_IOC_DUMP_LOG: { struct llog_ctxt *ctxt; ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); diff --git a/lustre/mgs/mgs_internal.h b/lustre/mgs/mgs_internal.h index c0620a1..d5d18bd 100644 --- a/lustre/mgs/mgs_internal.h +++ b/lustre/mgs/mgs_internal.h @@ -95,6 +95,9 @@ int mgs_erase_log(struct obd_device *obd, char *name); int mgs_erase_logs(struct obd_device *obd, char *fsname); int mgs_setparam(struct obd_device *obd, struct lustre_cfg *lcfg, char *fsname); +int mgs_pool_cmd(struct obd_device *obd, enum lcfg_command_type cmd, + char *poolname, char *fsname, char *ostname); + /* mgs_fs.c */ int mgs_fs_setup(struct obd_device *obd, struct vfsmount *mnt); int mgs_fs_cleanup(struct obd_device *obddev); diff --git a/lustre/mgs/mgs_llog.c b/lustre/mgs/mgs_llog.c index 8c3f77d..b621ca6 100644 --- a/lustre/mgs/mgs_llog.c +++ b/lustre/mgs/mgs_llog.c @@ -3235,6 +3235,145 @@ out: RETURN(rc); } +static int mgs_write_log_pool(struct obd_device *obd, char *logname, struct fs_db *fsdb, + char *lovname, + enum lcfg_command_type cmd, + char *poolname, char *fsname, + char *ostname, char *comment) +{ + struct llog_handle *llh = NULL; + int rc; + + rc = record_start_log(obd, &llh, logname); + if (rc) + RETURN(rc); + rc = record_marker(obd, llh, fsdb, CM_START, lovname, comment); + record_base(obd, llh, lovname, 0, cmd, poolname, fsname, ostname, 0); + rc = record_marker(obd, llh, fsdb, CM_END, lovname, comment); + rc = record_end_log(obd, &llh); + + return(rc); +} + +int mgs_pool_cmd(struct obd_device *obd, enum lcfg_command_type cmd, + char *fsname, char *poolname, char *ostname) +{ + struct fs_db *fsdb; + char 
mdt_index[16]; + char *lovname; + char *logname; + char *label, *canceled_label = NULL; + int label_sz; + struct mgs_target_info *mti; + int rc; + ENTRY; + + rc = mgs_find_or_make_fsdb(obd, fsname, &fsdb); + if (rc) { + CERROR("Can't get db for %s\n", fsname); + RETURN(rc); + } + if (fsdb->fsdb_flags & FSDB_LOG_EMPTY) { + CERROR("%s is not defined\n", fsname); + mgs_free_fsdb(obd, fsdb); + RETURN(-EINVAL); + } + + label_sz = 10 + strlen(fsname) + strlen(poolname); + + /* check if ostname match fsname */ + if (ostname != NULL) { + char *ptr; + + ptr = strrchr(ostname, '-'); + if ((ptr == NULL) || + (strncmp(fsname, ostname, ptr-ostname) != 0)) + RETURN(-EINVAL); + label_sz += strlen(ostname); + } + + OBD_ALLOC(label, label_sz); + if (label == NULL) + RETURN(-ENOMEM); + + switch(cmd) { + case LCFG_POOL_NEW: { + sprintf(label, + "new %s.%s", fsname, poolname); + break; + } + case LCFG_POOL_ADD: { + sprintf(label, + "add %s.%s.%s", fsname, poolname, ostname); + break; + } + case LCFG_POOL_REM: { + OBD_ALLOC(canceled_label, label_sz); + if (canceled_label == NULL) + RETURN(-ENOMEM); + sprintf(label, + "rem %s.%s.%s", fsname, poolname, ostname); + sprintf(canceled_label, + "add %s.%s.%s", fsname, poolname, ostname); + break; + } + case LCFG_POOL_DEL: { + OBD_ALLOC(canceled_label, label_sz); + if (canceled_label == NULL) + RETURN(-ENOMEM); + sprintf(label, + "del %s.%s", fsname, poolname); + sprintf(canceled_label, + "new %s.%s", fsname, poolname); + break; + } + default: { + break; + } + } + + down(&fsdb->fsdb_sem); + + sprintf(mdt_index, "-MDT%04x", 0); + name_create(&logname, fsname, mdt_index); + name_create(&lovname, logname, "-mdtlov"); + + mti = NULL; + if (canceled_label != NULL) { + OBD_ALLOC(mti, sizeof(*mti)); + if (mti != NULL) { + strcpy(mti->mti_svname, "lov pool"); + mgs_modify(obd, fsdb, mti, logname, lovname, + canceled_label, CM_SKIP); + } + } + + mgs_write_log_pool(obd, logname, fsdb, lovname, + cmd, fsname, poolname, ostname, label); + 
name_destroy(&logname); + + name_create(&logname, fsname, "-client"); + if (canceled_label != NULL) { + mgs_modify(obd, fsdb, mti, logname, lovname, + canceled_label, CM_SKIP); + } + mgs_write_log_pool(obd, logname, fsdb, fsdb->fsdb_clilov, + cmd, fsname, poolname, ostname, label); + name_destroy(&logname); + name_destroy(&lovname); + + up(&fsdb->fsdb_sem); + + OBD_FREE(label, label_sz); + if (canceled_label != NULL) + OBD_FREE(canceled_label, label_sz); + + if (mti != NULL) + OBD_FREE(mti, sizeof(*mti)); + + RETURN(rc); +} + #if 0 /******************** unused *********************/ static int mgs_backup_llog(struct obd_device *obd, char* fsname) diff --git a/lustre/obdclass/debug.c b/lustre/obdclass/debug.c index 1f4d99b..ed5faa3 100644 --- a/lustre/obdclass/debug.c +++ b/lustre/obdclass/debug.c @@ -117,9 +117,10 @@ int dump_obdo(struct obdo *oa) void dump_lsm(int level, struct lov_stripe_md *lsm) { CDEBUG(level, "lsm %p, objid "LPX64", maxbytes "LPX64", magic 0x%08X, " - "stripe_size %u, stripe_count %u\n", lsm, + "stripe_size %u, stripe_count %u pool "POOLNAMEF"\n", lsm, lsm->lsm_object_id, lsm->lsm_maxbytes, lsm->lsm_magic, - lsm->lsm_stripe_size, lsm->lsm_stripe_count); + lsm->lsm_stripe_size, lsm->lsm_stripe_count, + lsm->lsm_pool_name); } /* XXX assumes only a single page in request */ diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index a0e0912..79eb987 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -111,28 +111,35 @@ static int lprocfs_obd_snprintf(char **page, int end, int *len, return n; } -int lprocfs_add_simple(struct proc_dir_entry *root, char *name, - read_proc_t *read_proc, write_proc_t *write_proc, - void *data) -{ - struct proc_dir_entry *proc; +cfs_proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root, + char *name, + read_proc_t *read_proc, + write_proc_t *write_proc, + void *data, + struct file_operations *fops) +{ + cfs_proc_dir_entry_t *proc; mode_t mode = 
0; if (root == NULL || name == NULL) - return -EINVAL; + return ERR_PTR(-EINVAL); if (read_proc) mode = 0444; if (write_proc) mode |= 0200; + if (fops) + mode = 0644; proc = create_proc_entry(name, mode, root); if (!proc) { CERROR("LprocFS: No memory to create /proc entry %s", name); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } proc->read_proc = read_proc; proc->write_proc = write_proc; proc->data = data; - return 0; + if (fops) + proc->proc_fops = fops; + return proc; } struct proc_dir_entry *lprocfs_add_symlink(const char *name, @@ -730,6 +737,8 @@ static const char *obd_connect_names[] = { "change_qunit_size", "alt_checksum_algorithm", "fid_is_enabled", + "version_recovery", + "pools", NULL }; @@ -1207,6 +1216,10 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) LPROCFS_OBD_OP_INIT(num_private_stats,stats,unregister_page_removal_cb); LPROCFS_OBD_OP_INIT(num_private_stats, stats, register_lock_cancel_cb); LPROCFS_OBD_OP_INIT(num_private_stats, stats,unregister_lock_cancel_cb); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del); } int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) @@ -1488,6 +1501,7 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid) int rc = 0; struct nid_stat *tmp = NULL, *tmp1; struct obd_device *obd = NULL; + cfs_proc_dir_entry_t *entry; ENTRY; *newnid = 0; @@ -1538,15 +1552,19 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid) GOTO(destroy_new, rc = -ENOMEM); } - rc = lprocfs_add_simple(tmp->nid_proc, "uuid", - lprocfs_exp_rd_uuid, NULL, tmp); - if (rc) + entry = lprocfs_add_simple(tmp->nid_proc, "uuid", + lprocfs_exp_rd_uuid, NULL, tmp, NULL); + if (IS_ERR(entry)) { CWARN("Error adding the uuid file\n"); + rc = PTR_ERR(entry); + } - rc = 
lprocfs_add_simple(tmp->nid_proc, "hash", - lprocfs_exp_rd_hash, NULL, tmp); - if (rc) + entry = lprocfs_add_simple(tmp->nid_proc, "hash", + lprocfs_exp_rd_hash, NULL, tmp, NULL); + if (IS_ERR(entry)) { CWARN("Error adding the hash file\n"); + rc = PTR_ERR(entry); + } exp->exp_nid_stats = tmp; *newnid = 1; diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index fbc8a8d..d375ab7a 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -825,6 +825,28 @@ int class_process_config(struct lustre_cfg *lcfg) err = class_del_conn(obd, lcfg); GOTO(out, err = 0); } + case LCFG_POOL_NEW: { + err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2)); + GOTO(out, err = 0); + break; + } + case LCFG_POOL_ADD: { + err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err = 0); + break; + } + case LCFG_POOL_REM: { + err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err = 0); + break; + } + case LCFG_POOL_DEL: { + err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2)); + GOTO(out, err = 0); + break; + } default: { err = obd_process_config(obd, sizeof(*lcfg), lcfg); GOTO(out, err); diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 49f209e..3c7c88f 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -2151,7 +2151,7 @@ static int filter_setup(struct obd_device *obd, struct lustre_cfg* lcfg) if (obd->obd_proc_exports_entry) lprocfs_add_simple(obd->obd_proc_exports_entry, "clear", lprocfs_nid_stats_clear_read, - lprocfs_nid_stats_clear_write, obd); + lprocfs_nid_stats_clear_write, obd, NULL); memcpy((void *)addr, lustre_cfg_buf(lcfg, 4), LUSTRE_CFG_BUFLEN(lcfg, 4)); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index b79cfd4..8e1e2d5 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -3466,29 +3466,45 @@ static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs, */ static 
int osc_getstripe(struct lov_stripe_md *lsm, struct lov_user_md *lump) { - struct lov_user_md lum, *lumk; + /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ + struct lov_user_md_v3 lum, *lumk; + struct lov_user_ost_data_v1 *lmm_objects; int rc = 0, lum_size; ENTRY; if (!lsm) RETURN(-ENODATA); - if (copy_from_user(&lum, lump, sizeof(lum))) + /* we only need the header part from user space to get lmm_magic and + * lmm_stripe_count, (the header part is common to v1 and v3) */ + lum_size = sizeof(struct lov_user_md_v1); + if (copy_from_user(&lum, lump, lum_size)) RETURN(-EFAULT); - if (lum.lmm_magic != LOV_USER_MAGIC) + if ((lum.lmm_magic != LOV_USER_MAGIC_V1) && + (lum.lmm_magic != LOV_USER_MAGIC_V3)) RETURN(-EINVAL); + /* lov_user_md_vX and lov_mds_md_vX must have the same size */ + LASSERT(sizeof(struct lov_user_md_v1) == sizeof(struct lov_mds_md_v1)); + LASSERT(sizeof(struct lov_user_md_v3) == sizeof(struct lov_mds_md_v3)); + LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lumk->lmm_objects[0])); + + /* we can use lov_mds_md_size() to compute lum_size + * because lov_user_md_vX and lov_mds_md_vX have the same size */ if (lum.lmm_stripe_count > 0) { - lum_size = sizeof(lum) + sizeof(lum.lmm_objects[0]); + lum_size = lov_mds_md_size(lum.lmm_stripe_count, lum.lmm_magic); OBD_ALLOC(lumk, lum_size); if (!lumk) RETURN(-ENOMEM); - lumk->lmm_objects[0].l_object_id = lsm->lsm_object_id; - lumk->lmm_objects[0].l_object_gr = lsm->lsm_object_gr; + if (lum.lmm_magic == LOV_USER_MAGIC_V1) + lmm_objects = &(((struct lov_user_md_v1 *)lumk)->lmm_objects[0]); + else + lmm_objects = &(lumk->lmm_objects[0]); + lmm_objects->l_object_id = lsm->lsm_object_id; } else { - lum_size = sizeof(lum); + lum_size = lov_mds_md_size(0, lum.lmm_magic); lumk = &lum; } diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 7df1357..97ea280 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -1955,10 +1955,9 @@ static void print_lum 
(struct lov_user_md *lum) CDEBUG(D_OTHER, "\tlmm_stripe_offset: %#x\n", lum->lmm_stripe_offset); } -void lustre_swab_lov_user_md(struct lov_user_md *lum) +static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum) { ENTRY; - CDEBUG(D_IOCTL, "swabbing lov_user_md\n"); __swab32s(&lum->lmm_magic); __swab32s(&lum->lmm_pattern); __swab64s(&lum->lmm_object_id); @@ -1982,6 +1981,23 @@ static void print_lumj (struct lov_user_md_join *lumj) CDEBUG(D_OTHER, "\tlmm_extent_count: %#x\n", lumj->lmm_extent_count); } +void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_user_md v1\n"); + lustre_swab_lov_user_md_common(lum); + EXIT; +} + +void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum) +{ + ENTRY; + CDEBUG(D_IOCTL, "swabbing lov_user_md v3\n"); + lustre_swab_lov_user_md_common((struct lov_user_md_v1 *)lum); + /* lmm_pool_name nothing to do with char */ + EXIT; +} + void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj) { ENTRY; @@ -1997,63 +2013,20 @@ void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj) EXIT; } -static void print_lum_objs(struct lov_user_md *lum) +void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count) { - struct lov_user_ost_data *lod; int i; ENTRY; - if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ - return; - CDEBUG(D_OTHER, "lov_user_md_objects: %p\n", lum); - for (i = 0; i < lum->lmm_stripe_count; i++) { - lod = &lum->lmm_objects[i]; - CDEBUG(D_OTHER, "(%i) lod->l_object_id: "LPX64"\n", i, lod->l_object_id); - CDEBUG(D_OTHER, "(%i) lod->l_object_gr: "LPX64"\n", i, lod->l_object_gr); - CDEBUG(D_OTHER, "(%i) lod->l_ost_gen: %#x\n", i, lod->l_ost_gen); - CDEBUG(D_OTHER, "(%i) lod->l_ost_idx: %#x\n", i, lod->l_ost_idx); + for (i = 0; i < stripe_count; i++) { + __swab64s(&(lod[i].l_object_id)); + __swab64s(&(lod[i].l_object_gr)); + __swab32s(&(lod[i].l_ost_gen)); + __swab32s(&(lod[i].l_ost_idx)); } EXIT; } -void 
lustre_swab_lov_user_md_objects(struct lov_user_md *lum) -{ - struct lov_user_ost_data *lod; - int i; - ENTRY; - for (i = 0; i < lum->lmm_stripe_count; i++) { - lod = &lum->lmm_objects[i]; - __swab64s(&lod->l_object_id); - __swab64s(&lod->l_object_gr); - __swab32s(&lod->l_ost_gen); - __swab32s(&lod->l_ost_idx); - } - print_lum_objs(lum); - EXIT; -} - - -void lustre_swab_lov_mds_md(struct lov_mds_md *lmm) -{ - struct lov_ost_data *lod; - int i; - ENTRY; - for (i = 0; i < lmm->lmm_stripe_count; i++) { - lod = &lmm->lmm_objects[i]; - __swab64s(&lod->l_object_id); - __swab64s(&lod->l_object_gr); - __swab32s(&lod->l_ost_gen); - __swab32s(&lod->l_ost_idx); - } - __swab32s(&lmm->lmm_magic); - __swab32s(&lmm->lmm_pattern); - __swab64s(&lmm->lmm_object_id); - __swab64s(&lmm->lmm_object_gr); - __swab32s(&lmm->lmm_stripe_size); - __swab32s(&lmm->lmm_stripe_count); - - EXIT; -} - void lustre_swab_ldlm_res_id (struct ldlm_res_id *id) { diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index 5d7d35e..76e0727 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -252,8 +252,8 @@ EXPORT_SYMBOL(lustre_swab_mds_rec_unlink); EXPORT_SYMBOL(lustre_swab_mds_rec_rename); EXPORT_SYMBOL(lustre_swab_mdt_rec_reint); EXPORT_SYMBOL(lustre_swab_lov_desc); -EXPORT_SYMBOL(lustre_swab_lov_user_md); -EXPORT_SYMBOL(lustre_swab_lov_mds_md); +EXPORT_SYMBOL(lustre_swab_lov_user_md_v1); +EXPORT_SYMBOL(lustre_swab_lov_user_md_v3); EXPORT_SYMBOL(lustre_swab_lov_user_md_objects); EXPORT_SYMBOL(lustre_swab_lov_user_md_join); EXPORT_SYMBOL(lustre_swab_ldlm_res_id); diff --git a/lustre/tests/ll_dirstripe_verify.c b/lustre/tests/ll_dirstripe_verify.c index 2c92091..8239298 100644 --- a/lustre/tests/ll_dirstripe_verify.c +++ b/lustre/tests/ll_dirstripe_verify.c @@ -240,7 +240,7 @@ int main(int argc, char **argv) return rc; } - lum_size = lov_mds_md_size(MAX_LOV_UUID_COUNT); + lum_size = lov_mds_md_size(MAX_LOV_UUID_COUNT, LOV_MAGIC); if ((lum_dir = (struct 
lov_user_md *)malloc(lum_size)) == NULL) { rc = ENOMEM; llapi_err(LLAPI_MSG_ERROR, "error: can't allocate %d bytes " diff --git a/lustre/tests/ll_getstripe_info.c b/lustre/tests/ll_getstripe_info.c index 4b61102..ca5093c 100644 --- a/lustre/tests/ll_getstripe_info.c +++ b/lustre/tests/ll_getstripe_info.c @@ -64,7 +64,7 @@ int main(int argc, char** argv) return 1; } - lum_size = lov_mds_md_size(MAX_LOV_UUID_COUNT); + lum_size = lov_mds_md_size(MAX_LOV_UUID_COUNT, LOV_MAGIC); if ((lum_file = (struct lov_user_md *)malloc(lum_size)) == NULL) { fprintf(stderr, "unable to allocate memory for ioctl's"); diff --git a/lustre/tests/llmount.sh b/lustre/tests/llmount.sh index b3024aa..82217bc 100755 --- a/lustre/tests/llmount.sh +++ b/lustre/tests/llmount.sh @@ -7,5 +7,6 @@ LUSTRE=${LUSTRE:-`dirname $0`/..} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +[ -n "$LOAD" ] && load_modules && exit 0 [ -z "$NOFORMAT" ] && formatall [ -z "$NOSETUP" ] && setupall diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 7cd617f..8232094 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -3540,6 +3540,7 @@ test_99a() { run_test 99a "cvs init =========================================" test_99b() { + [ -z "$(which cvs 2>/dev/null)" ] && skip "could not find cvs" && return [ ! -d $DIR/d99cvsroot ] && test_99a cd /etc/init.d # some versions of cvs import exit(1) when asked to import links or @@ -3552,6 +3553,7 @@ test_99b() { run_test 99b "cvs import =======================================" test_99c() { + [ -z "$(which cvs 2>/dev/null)" ] && skip "could not find cvs" && return [ ! -d $DIR/d99cvsroot ] && test_99b cd $DIR mkdir -p $DIR/d99reposname @@ -3561,6 +3563,7 @@ test_99c() { run_test 99c "cvs checkout =====================================" test_99d() { + [ -z "$(which cvs 2>/dev/null)" ] && skip "could not find cvs" && return [ ! 
-d $DIR/d99cvsroot ] && test_99c cd $DIR/d99reposname $RUNAS touch foo99 @@ -3569,6 +3572,7 @@ test_99d() { run_test 99d "cvs add ==========================================" test_99e() { + [ -z "$(which cvs 2>/dev/null)" ] && skip "could not find cvs" && return [ ! -d $DIR/d99cvsroot ] && test_99c cd $DIR/d99reposname $RUNAS cvs update @@ -3576,6 +3580,7 @@ test_99e() { run_test 99e "cvs update =======================================" test_99f() { + [ -z "$(which cvs 2>/dev/null)" ] && skip "could not find cvs" && return [ ! -d $DIR/d99cvsroot ] && test_99d cd $DIR/d99reposname $RUNAS cvs commit -m 'nomsg' foo99 @@ -4955,8 +4960,10 @@ test_121() { #bug #10589 run_test 121 "read cancel race =========" test_123a() { # was test 123, statahead(bug 11401) + SLOWOK=0 if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then log "testing on UP system. Performance may be not as good as expected." + SLOWOK=1 fi remount_client $MOUNT @@ -5011,7 +5018,7 @@ test_123a() { # was test 123, statahead(bug 11401) lctl get_param -n llite.*.statahead_stats # wait for commitment of removal sleep 2 - [ $error -ne 0 ] && error "statahead is slow!" + [ $error -ne 0 -a $SLOWOK -eq 0 ] && error "statahead is slow!" 
return 0 } run_test 123a "verify statahead work" @@ -5496,6 +5503,123 @@ test_130e() { } run_test 130e "FIEMAP (test continuation FIEMAP calls)" +POOL=${POOL:-cea1} +TGT_COUNT=$OSTCOUNT +TGTPOOL_FIRST=1 +TGTPOOL_MAX=$(($TGT_COUNT - 1)) +TGTPOOL_STEP=2 +TGTPOOL_LIST=`seq $TGTPOOL_FIRST $TGTPOOL_STEP $TGTPOOL_MAX` +POOL_ROOT=${POOL_ROOT:-$DIR/d200.pools} +POOL_DIR=$POOL_ROOT/dir_tst +POOL_FILE=$POOL_ROOT/file_tst + +check_file_in_pool() +{ + file=$1 + res=$($GETSTRIPE $file | grep 0x | cut -f2) + for i in $res + do + found=$(echo :$TGTPOOL_LIST: | tr " " ":" | grep :$i:) + if [[ "$found" == "" ]] + then + echo "pool list: $TGTPOOL_LIST" + echo "striping: $res" + error "$file not allocated in $POOL" + return 1 + fi + done + return 0 +} + +test_200() { + do_facet mgs $LCTL pool_new $FSNAME.$POOL + do_facet mgs $LCTL get_param -n lov.$FSNAME-MDT0000-mdtlov.pools.$POOL + [ $? == 0 ] || error "Pool creation of $POOL failed" +} +run_test 200 "Create new pool ==========================================" + +test_201() { + TGT=$(seq -f $FSNAME-OST%04g_UUID $TGTPOOL_FIRST $TGTPOOL_STEP \ + $TGTPOOL_MAX | tr '\n' ' ') + do_facet mgs $LCTL pool_add $FSNAME.$POOL \ + $FSNAME-OST[$TGTPOOL_FIRST-$TGTPOOL_MAX/$TGTPOOL_STEP]_UUID + res=$(do_facet mgs $LCTL get_param -n lov.$FSNAME-MDT0000-mdtlov.pools.$POOL | sort \ + | tr '\n' ' ') + [ "$res" = "$TGT" ] || error "Pool content ($res) do not match requested ($TGT)" +} +run_test 201 "Add targets to a pool ====================================" + +test_202a() { + mkdir -p $POOL_DIR + $SETSTRIPE -c 2 -p $POOL $POOL_DIR + [ $? 
= 0 ] || error "Cannot set pool $POOL to $POOL_DIR" +} +run_test 202a "Set pool on a directory =================================" + +test_202b() { + res=$($GETSTRIPE $POOL_DIR | grep pool: | cut -f8 -d " ") + [ "$res" = $POOL ] || error "Pool on $POOL_DIR is not $POOL" +} +run_test 202b "Check pool on a directory ===============================" + +test_202c() { + failed=0 + for i in $(seq -w 1 $(($TGT_COUNT * 3))) + do + file=$POOL_DIR/file-$i + touch $file + check_file_in_pool $file + if [[ $? != 0 ]] + then + failed=$(($failed + 1)) + fi + done + [ "$failed" = 0 ] || error "$failed files not allocated in $POOL" +} +run_test 202c "Check files allocation from directory pool ==============" + +test_203() { + mkdir -p $POOL_FILE + failed=0 + for i in $(seq -w 1 $(($TGT_COUNT * 3))) + do + file=$POOL_FILE/spoo-$i + $SETSTRIPE -p $POOL $file + check_file_in_pool $file + if [[ $? != 0 ]] + then + failed=$(($failed + 1)) + fi + done + [ "$failed" = 0 ] || error "$failed files not allocated in $POOL" +} +run_test 203 "Create files in a pool ===================================" + +test_210a() { + TGT=$(do_facet mgs $LCTL get_param -n lov.$FSNAME-MDT0000-mdtlov.pools.$POOL | head -1) + do_facet mgs $LCTL pool_remove $FSNAME.$POOL $TGT + res=$(do_facet mgs $LCTL get_param -n lov.$FSNAME-MDT0000-mdtlov.pools.$POOL | grep $TGT) + [ "$res" = "" ] || error "$TGT not removed from $FSNAME.$POOL" +} +run_test 210a "Remove a target from a pool =============================" + +test_210b() { + for TGT in $(do_facet mgs $LCTL get_param -n lov.$FSNAME-MDT0000-mdtlov.pools.$POOL) + do + do_facet mgs $LCTL pool_remove $FSNAME.$POOL $TGT + done + res=$(do_facet mgs $LCTL get_param -n lov.$FSNAME-MDT0000-mdtlov.pools.$POOL) + [ "$res" = "" ] || error "Pool $FSNAME.$POOL cannot be drained" +} +run_test 210b "Remove all targets from a pool ==========================" + +test_211() { + do_facet mgs $LCTL pool_destroy $FSNAME.$POOL + res=$(do_facet mgs "$LCTL get_param -n 
lov.$FSNAME-MDT0000-mdtlov.pools.$POOL 2>/dev/null") + [ "$res" = "" ] || error "Pool $FSNAME.$POOL is not destroyed" +} +run_test 211 "Remove a pool ============================================" + TMPDIR=$OLDTMPDIR TMP=$OLDTMP HOME=$OLDHOME diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index b692fe5..229dbf6 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -83,7 +83,7 @@ init_test_env() { export LUSTRE=`absolute_path $LUSTRE` export TESTSUITE=`basename $0 .sh` - [ -d /r ] && export ROOT=${ROOT:-/r} + #[ -d /r ] && export ROOT=${ROOT:-/r} export TMP=${TMP:-$ROOT/tmp} export TESTSUITELOG=${TMP}/${TESTSUITE}.log export HOSTNAME=${HOSTNAME:-`hostname`} @@ -506,7 +506,7 @@ zconf_mount() { do_node $client "lctl set_param debug=$PTLDEBUG; lctl set_param subsystem_debug=${SUBSYSTEM# }; lctl set_param debug_mb=${DEBUG_SIZE}" - [ -d /r ] && $LCTL modules > /r/tmp/ogdb-$HOSTNAME + return 0 } diff --git a/lustre/utils/Makefile.am b/lustre/utils/Makefile.am index 19b7d41..6b2c150 100644 --- a/lustre/utils/Makefile.am +++ b/lustre/utils/Makefile.am @@ -33,7 +33,7 @@ endif # UTILS lib_LIBRARIES = liblustreapi.a libiam.a lctl_SOURCES = obd.c lustre_cfg.c lctl.c obdctl.h -lctl_LDADD := $(LIBREADLINE) $(LIBPTLCTL) +lctl_LDADD := $(LIBREADLINE) liblustreapi.a $(LIBPTLCTL) lctl_DEPENDENCIES := $(LIBPTLCTL) lfs_SOURCES = lfs.c obd.c lustre_cfg.c @@ -41,7 +41,7 @@ lfs_LDADD := $(LIBREADLINE) liblustreapi.a $(LIBPTLCTL) lfs_DEPENDENCIES := $(LIBPTLCTL) liblustreapi.a loadgen_SOURCES = loadgen.c lustre_cfg.c obd.c -loadgen_LDADD := $(LIBREADLINE) $(LIBPTLCTL) $(PTHREAD_LIBS) +loadgen_LDADD := $(LIBREADLINE) liblustreapi.a $(LIBPTLCTL) $(PTHREAD_LIBS) loadgen_DEPENDENCIES := $(LIBPTLCTL) if EXT2FS_DEVEL diff --git a/lustre/utils/lctl.c b/lustre/utils/lctl.c index a1b58f6..264ffcc 100644 --- a/lustre/utils/lctl.c +++ b/lustre/utils/lctl.c @@ -208,6 +208,24 @@ command_t cmdlist[] = { "get the device info of a attached file\n" 
"usage: blockdev_info "}, + /* Pool commands */ + {"=== Pools ==", jt_noop, 0, "pool management"}, + {"pool_new", jt_pool_cmd, 0, + "add a new pool\n" + "usage pool_new ."}, + {"pool_add", jt_pool_cmd, 0, + "add the named OSTs to the pool\n" + "usage pool_add . "}, + {"pool_remove", jt_pool_cmd, 0, + "remove the named OST from the pool\n" + "usage pool_remove . "}, + {"pool_destroy", jt_pool_cmd, 0, + "destroy a pool\n" + "usage pool_destroy ."}, + {"pool_list", jt_pool_cmd, 0, + "list pools and pools members\n" + "usage pool_list [.] | "}, + /* Test only commands */ {"==== testing (DANGEROUS) ====", jt_noop, 0, "testing (DANGEROUS)"}, {"--threads", jt_opt_threads, 0, diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c index 8d4246f..2479e93 100644 --- a/lustre/utils/lfs.c +++ b/lustre/utils/lfs.c @@ -40,6 +40,11 @@ * Author: Robert Read */ +/* for O_DIRECTORY */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + #include #include #include @@ -94,6 +99,7 @@ static int lfs_rsetfacl(int argc, char **argv); static int lfs_rgetfacl(int argc, char **argv); static int lfs_cp(int argc, char **argv); static int lfs_ls(int argc, char **argv); +static int lfs_poollist(int argc, char **argv); /* all avaialable commands */ command_t cmdlist[] = { @@ -101,30 +107,34 @@ command_t cmdlist[] = { "Create a new file with a specific striping pattern or\n" "set the default striping pattern on an existing directory or\n" "delete the default striping pattern from an existing directory\n" - "usage: setstripe \n" - " or \n" - " setstripe [--size|-s stripe_size]\n" - " [--index|-i stripe_index]\n" - " [--count|-c stripe_count]\n" + "usage: setstripe [--size|-s stripe_size] [--offset|-o start_ost]\n" + " [--count|-c stripe_count] [--pool|-p pool_name]\n" + " \n" " or \n" - " setstripe -d (to delete default striping)\n" + " setstripe -d (to delete default striping)\n" "\tstripe_size: Number of bytes on each OST (0 filesystem default)\n" - "\t Can be specified with k, m or g (in KB, MB and 
GB respectively)\n" - "\tstripe_index: OST index of first stripe (-1 filesystem default)\n" - "\tstripe_count: Number of OSTs to stripe over (0 default, -1 all)"}, + "\t Can be specified with k, m or g (in KB, MB and GB\n" + "\t respectively)\n" + "\tstart_ost: OST index of first stripe (-1 filesystem default)\n" + "\tstripe_count: Number of OSTs to stripe over (0 default, -1 all)\n" + "\tpool_name: Name of OST pool"}, {"getstripe", lfs_getstripe, 0, - "To list the striping info for a given filename or files in a\n" + "To list the striping info for a given file or files in a\n" "directory or recursively for all files in a directory tree.\n" "usage: getstripe [--obd|-O ] [--quiet | -q] [--verbose | -v]\n" " [--recursive | -r] ..."}, + {"poollist", lfs_poollist, 0, + "List pools or pool OSTs\n" + "usage: poollist [.] | \n"}, {"find", lfs_find, 0, "To find files that match given parameters recursively in a directory tree.\n" - "usage: find ... \n" + "usage: find ... \n" " [[!] --atime|-A [+-]N] [[!] --mtime|-M [+-]N] [[!] --ctime|-C [+-]N]\n" " [--maxdepth|-D N] [[!] --name|-n ] [--print0|-P]\n" " [--print|-p] [--obd|-O ] [[!] --size|-s [+-]N[bkMGTP]]\n" " [[!] --type|-t ] [[!] --gid|-g N] [[!] --group|-G ]\n" " [[!] --uid|-u N] [[!] --user|-U ]\n" + " [[!] 
--pool ]\n" "\t !: used before an option indicates 'NOT' the requested attribute\n" "\t -: used before an value indicates 'AT MOST' the requested value\n" "\t +: used before an option indicates 'AT LEAST' the requested value\n"}, @@ -203,12 +213,15 @@ static int lfs_setstripe(int argc, char **argv) char *stripe_size_arg = NULL; char *stripe_off_arg = NULL; char *stripe_count_arg = NULL; + char *pool_name_arg = NULL; unsigned long long size_units; struct option long_opts[] = { {"size", required_argument, 0, 's'}, {"count", required_argument, 0, 'c'}, {"index", required_argument, 0, 'i'}, + {"offset", required_argument, 0, 'o'}, + {"pool", required_argument, 0, 'p'}, {"delete", no_argument, 0, 'd'}, {0, 0, 0, 0} }; @@ -221,7 +234,7 @@ static int lfs_setstripe(int argc, char **argv) * usage */ fname = argv[2]; optind = 2; - } else if (argc == 5 && + } else if (argc == 5 && (argv[2][0] != '-' || isdigit(argv[2][1])) && (argv[3][0] != '-' || isdigit(argv[3][1])) && (argv[4][0] != '-' || isdigit(argv[4][1])) ) { @@ -234,7 +247,7 @@ static int lfs_setstripe(int argc, char **argv) optind = 4; } else { optind = 0; - while ((c = getopt_long(argc, argv, "c:di:s:", + while ((c = getopt_long(argc, argv, "c:di:o:s:p:", long_opts, NULL)) >= 0) { switch (c) { case 0: @@ -248,11 +261,15 @@ static int lfs_setstripe(int argc, char **argv) delete = 1; break; case 'i': + case 'o': stripe_off_arg = optarg; break; case 's': stripe_size_arg = optarg; break; + case 'p': + pool_name_arg = optarg; + break; case '?': return CMD_HELP; default: @@ -268,11 +285,11 @@ static int lfs_setstripe(int argc, char **argv) return CMD_HELP; - if (delete && - (stripe_size_arg != NULL || stripe_off_arg != NULL || - stripe_count_arg != NULL)) { + if (delete && + (stripe_size_arg != NULL || stripe_off_arg != NULL || + stripe_count_arg != NULL || pool_name_arg != NULL)) { fprintf(stderr, "error: %s: cannot specify -d with " - "-s, -c or -i options\n", + "-s, -c -o or -p options\n", argv[0]); return CMD_HELP; } 
@@ -312,7 +329,12 @@ static int lfs_setstripe(int argc, char **argv) } } - result = llapi_file_create(fname, st_size, st_offset, st_count, 0); + if (pool_name_arg == NULL) + result = llapi_file_create(fname, st_size, st_offset, st_count, 0); + else + result = llapi_file_create_pool(fname, st_size, st_offset, + st_count, 0, pool_name_arg); + if (result) fprintf(stderr, "error: %s: create stripe file failed\n", argv[0]); @@ -320,11 +342,19 @@ static int lfs_setstripe(int argc, char **argv) return result; } +static int lfs_poollist(int argc, char **argv) +{ + if (argc != 2) + return CMD_HELP; + + return llapi_poollist(argv[1]); +} + static int set_time(time_t *time, time_t *set, char *str) { time_t t; int res = 0; - + if (str[0] == '+') res = 1; else if (str[0] == '-') @@ -399,6 +429,7 @@ static int id2name(char **name, unsigned int id, int type) return 0; } +#define FIND_POOL_OPT 3 static int lfs_find(int argc, char **argv) { int new_fashion = 1; @@ -417,6 +448,8 @@ static int lfs_find(int argc, char **argv) {"uid", required_argument, 0, 'u'}, {"user", required_argument, 0, 'U'}, {"name", required_argument, 0, 'n'}, + /* no short option for pool, p/P already used */ + {"pool", required_argument, 0, FIND_POOL_OPT}, /* --obd is considered as a new option. 
*/ {"obd", required_argument, 0, 'O'}, {"ost", required_argument, 0, 'O'}, @@ -522,8 +555,8 @@ static int lfs_find(int argc, char **argv) new_fashion = 1; param.gid = strtol(optarg, &endptr, 10); if (optarg == endptr) { - ret = name2id(&param.gid, optarg, GRPQUOTA); - if (ret != 0) { + ret = name2id(&param.gid, optarg, GRPQUOTA); + if (ret != 0) { fprintf(stderr, "Group/GID: %s cannot " "be found.\n", optarg); return -1; @@ -546,8 +579,8 @@ static int lfs_find(int argc, char **argv) new_fashion = 1; param.uid = strtol(optarg, &endptr, 10); if (optarg == endptr) { - ret = name2id(&param.uid, optarg, USRQUOTA); - if (ret != 0) { + ret = name2id(&param.uid, optarg, USRQUOTA); + if (ret != 0) { fprintf(stderr, "User/UID: %s cannot " "be found.\n", optarg); return -1; @@ -556,6 +589,22 @@ static int lfs_find(int argc, char **argv) param.exclude_uid = !!neg_opt; param.check_uid = 1; break; + case FIND_POOL_OPT: + new_fashion = 1; + if (strlen(optarg) > MAXPOOLNAME) { + fprintf(stderr, + "Pool name %s is too long" + " (max is %d)\n", optarg, + MAXPOOLNAME); + return -1; + } + /* we do check for empty pool because empty pool + * is used to find V1 lov attributes */ + strncpy(param.poolname, optarg, MAXPOOLNAME); + param.poolname[MAXPOOLNAME] = '\0'; + param.exclude_pool = !!neg_opt; + param.check_pool = 1; + break; case 'n': new_fashion = 1; param.pattern = (char *)optarg; @@ -667,7 +716,7 @@ static int lfs_find(int argc, char **argv) return CMD_HELP; }; } - + if (pathstart == -1) { fprintf(stderr, "error: %s: no filename|pathname\n", argv[0]); @@ -689,7 +738,7 @@ static int lfs_find(int argc, char **argv) if (!param.recursive && param.maxdepth == -1) param.maxdepth = 1; } - + do { if (new_fashion) ret = llapi_find(argv[pathstart], &param); @@ -763,7 +812,7 @@ static int lfs_getstripe(int argc, char **argv) } while (++optind < argc && !rc); if (rc) - fprintf(stderr, "error: %s failed for %s.\n", + fprintf(stderr, "error: %s failed for %s.\n", argv[0], argv[optind - 1]); return rc; } @@ 
-1440,7 +1489,7 @@ do { \ * 2. specifiers may be encountered multiple times (2s3s is 5 seconds) * 3. empty integer value is interpreted as 0 */ - + static unsigned long str2sec(const char* timestr) { const char spec[] = "smhdw"; const unsigned long mult[] = {1, 60, 60*60, 24*60*60, 7*24*60*60}; @@ -1462,7 +1511,7 @@ static unsigned long str2sec(const char* timestr) { v = strtoul(timestr, &tail, 10); if (v == ULONG_MAX || *tail == '\0') - /* value too large (ULONG_MAX or more) + /* value too large (ULONG_MAX or more) or missing specifier */ goto error; diff --git a/lustre/utils/liblustreapi.c b/lustre/utils/liblustreapi.c index 1adf2ec..f824151 100644 --- a/lustre/utils/liblustreapi.c +++ b/lustre/utils/liblustreapi.c @@ -59,6 +59,7 @@ #include #include #include +#include #ifdef HAVE_LINUX_UNISTD_H #include #else @@ -209,61 +210,69 @@ int parse_size(char *optarg, unsigned long long *size, return 0; } -int llapi_file_open(const char *name, int flags, int mode, - unsigned long stripe_size, int stripe_offset, - int stripe_count, int stripe_pattern) +int llapi_stripe_limit_check(unsigned long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern) { - struct lov_user_md lum = { 0 }; - int fd, rc = 0; - int isdir = 0; int page_size; - fd = open(name, flags | O_LOV_DELAY_CREATE, mode); - if (fd < 0 && errno == EISDIR) { - fd = open(name, O_DIRECTORY | O_RDONLY); - isdir++; - } - - if (fd < 0) { - rc = -errno; - llapi_err(LLAPI_MSG_ERROR, "unable to open '%s'", name); - return rc; - } - /* 64 KB is the largest common page size I'm aware of (on ia64), but * check the local page size just in case. 
*/ page_size = LOV_MIN_STRIPE_SIZE; if (getpagesize() > page_size) { page_size = getpagesize(); - llapi_err_noerrno(LLAPI_MSG_WARN, + llapi_err_noerrno(LLAPI_MSG_WARN, "warning: your page size (%u) is " - "larger than expected (%u)", page_size, + "larger than expected (%u)", page_size, LOV_MIN_STRIPE_SIZE); } if (stripe_size < 0 || (stripe_size & (LOV_MIN_STRIPE_SIZE - 1))) { - errno = rc = -EINVAL; llapi_err(LLAPI_MSG_ERROR, "error: bad stripe_size %lu, " - "must be an even multiple of %d bytes", + "must be an even multiple of %d bytes", stripe_size, page_size); - goto out; + return -EINVAL; } if (stripe_offset < -1 || stripe_offset > MAX_OBD_DEVICES) { - errno = rc = -EINVAL; - llapi_err(LLAPI_MSG_ERROR, "error: bad stripe offset %d", + llapi_err(LLAPI_MSG_ERROR, "error: bad stripe offset %d", stripe_offset); - goto out; + return -EINVAL; } if (stripe_count < -1 || stripe_count > LOV_MAX_STRIPE_COUNT) { - errno = rc = -EINVAL; - llapi_err(LLAPI_MSG_ERROR, "error: bad stripe count %d", + llapi_err(LLAPI_MSG_ERROR, "error: bad stripe count %d", stripe_count); - goto out; + return -EINVAL; } if (stripe_count > 0 && (__u64)stripe_size * stripe_count > 0xffffffff){ - errno = rc = -EINVAL; llapi_err(LLAPI_MSG_ERROR, "error: stripe_size %lu * " - "stripe_count %u exceeds 4GB", stripe_size, + "stripe_count %u exceeds 4GB", stripe_size, stripe_count); + return -EINVAL; + } + return 0; +} + +int llapi_file_open(const char *name, int flags, int mode, + unsigned long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern) +{ + struct lov_user_md lum = { 0 }; + int fd, rc = 0; + int isdir = 0; + + fd = open(name, flags | O_LOV_DELAY_CREATE, mode); + if (fd < 0 && errno == EISDIR) { + fd = open(name, O_DIRECTORY | O_RDONLY); + isdir++; + } + + if (fd < 0) { + rc = -errno; + llapi_err(LLAPI_MSG_ERROR, "unable to open '%s'", name); + return rc; + } + + if ((rc = llapi_stripe_limit_check(stripe_size, stripe_offset, + stripe_count, stripe_pattern)) != 0) { + errno 
= rc; goto out; } @@ -293,6 +302,74 @@ out: return fd; } +static int poolpath(char *fsname, char *pathname, char *pool_pathname); + +int llapi_file_open_pool(const char *name, int flags, int mode, + unsigned long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, char *pool_name) +{ + struct lov_user_md_v3 lum = { 0 }; + int fd, rc = 0; + int isdir = 0; + char fsname[MAX_OBD_NAME + 1], *ptr; + + fd = open(name, flags | O_LOV_DELAY_CREATE, mode); + if (fd < 0 && errno == EISDIR) { + fd = open(name, O_DIRECTORY | O_RDONLY); + isdir++; + } + + if (fd < 0) { + rc = -errno; + llapi_err(LLAPI_MSG_ERROR, "unable to open '%s'", name); + return rc; + } + + if ((rc = llapi_stripe_limit_check(stripe_size, stripe_offset, + stripe_count, stripe_pattern)) != 0) { + errno = rc; + goto out; + } + + /* in case user give the full pool name ., skip + * the fsname */ + ptr = strchr(pool_name, '.'); + if (ptr != NULL) { + strncpy(fsname, pool_name, ptr - pool_name); + fsname[ptr - pool_name] = '\0'; + /* if fsname matches a fs skip it + * if not keep the poolname as is */ + if (poolpath(fsname, NULL, NULL) == 0) + pool_name = ptr + 1; + } + + /* Initialize IOCTL striping pattern structure */ + lum.lmm_magic = LOV_USER_MAGIC_V3; + lum.lmm_pattern = stripe_pattern; + lum.lmm_stripe_size = stripe_size; + lum.lmm_stripe_count = stripe_count; + lum.lmm_stripe_offset = stripe_offset; + strncpy(lum.lmm_pool_name, pool_name, MAXPOOLNAME); + + if (ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum)) { + char *errmsg = "stripe already set"; + rc = -errno; + if (errno != EEXIST && errno != EALREADY) + errmsg = strerror(errno); + + llapi_err_noerrno(LLAPI_MSG_ERROR, + "error on ioctl "LPX64" for '%s' (%d): %s", + (__u64)LL_IOC_LOV_SETSTRIPE, name, fd, errmsg); + } +out: + if (rc) { + close(fd); + fd = rc; + } + + return fd; +} + int llapi_file_create(const char *name, unsigned long stripe_size, int stripe_offset, int stripe_count, int stripe_pattern) { @@ -307,6 +384,202 @@ int 
llapi_file_create(const char *name, unsigned long stripe_size, return 0; } +int llapi_file_create_pool(const char *name, unsigned long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern, char *pool_name) +{ + int fd; + + fd = llapi_file_open_pool(name, O_CREAT | O_WRONLY, 0644, stripe_size, + stripe_offset, stripe_count, stripe_pattern, + pool_name); + if (fd < 0) + return fd; + + close(fd); + return 0; +} + + +static int print_pool_members(char *fs, char *pool_dir, char *pool_file) +{ + char path[PATH_MAX + 1]; + char buf[1024]; + FILE *fd; + + llapi_printf(LLAPI_MSG_NORMAL, "Pool: %s.%s\n", fs, pool_file); + sprintf(path, "%s/%s", pool_dir, pool_file); + if ((fd = fopen(path, "r")) == NULL) { + llapi_err(LLAPI_MSG_ERROR, "Cannot open %s\n", path); + return -EINVAL; + } + while (fgets(buf, sizeof(buf), fd) != NULL) + llapi_printf(LLAPI_MSG_NORMAL, buf); + + fclose(fd); + return 0; +} + +/* + * search lustre fsname from pathname + * + */ +static int search_fsname(char *pathname, char *fsname) +{ + char *ptr; + FILE *fp; + struct mntent *mnt = NULL; + + /* get the mount point */ + fp = setmntent(MOUNTED, "r"); + if (fp == NULL) { + llapi_err(LLAPI_MSG_ERROR, + "setmntent(%s) failed: %s:", MOUNTED, + strerror (errno)); + return -EIO; + } + mnt = getmntent(fp); + while ((feof(fp) == 0) && ferror(fp) == 0) { + if (llapi_is_lustre_mnt(mnt)) { + /* search by pathname */ + if (strncmp(mnt->mnt_dir, pathname, + strlen(mnt->mnt_dir)) == 0) { + ptr = strchr(mnt->mnt_fsname, '/'); + if (ptr == NULL) + return -EINVAL; + ptr++; + strcpy(fsname, ptr); + return 0; + } + } + mnt = getmntent(fp); + } + endmntent(fp); + return -ENOENT; + +} + +/* + * find the pool directory path under /proc + * (can be also used to test if a fsname is known) + */ +static int poolpath(char *fsname, char *pathname, char *pool_pathname) +{ + int rc = 0; + glob_t glob_info; + char pattern[PATH_MAX + 1]; + char buffer[PATH_MAX]; + + if (fsname == NULL) { + rc = 
search_fsname(pathname, buffer); + if (rc != 0) + return rc; + fsname = buffer; + strcpy(pathname, fsname); + } + + snprintf(pattern, PATH_MAX, + "/proc/fs/lustre/lov/%s-*/pools", + fsname); + rc = glob(pattern, GLOB_BRACE, NULL, &glob_info); + if (rc) + return -ENOENT; + + if (glob_info.gl_pathc == 0) { + globfree(&glob_info); + return -ENOENT; + } + + /* in fsname test mode, pool_pathname is NULL */ + if (pool_pathname != NULL) + strcpy(pool_pathname, glob_info.gl_pathv[0]); + + return 0; +} + +int llapi_poollist(char *name) +{ + char *poolname; + char *fsname; + char rname[PATH_MAX + 1], pathname[PATH_MAX + 1]; + char *ptr; + int rc = 0; + + /* is name a pathname ? */ + ptr = strchr(name, '/'); + if (ptr != NULL) { + /* only absolute pathname is supported */ + if (*name != '/') + return -EINVAL; + if (!realpath(name, rname)) { + rc = -errno; + llapi_err(LLAPI_MSG_ERROR, + "llapi_poollist: invalid path '%s'", + name); + return rc; + } + + rc = poolpath(NULL, rname, pathname); + if (rc != 0) { + errno = -rc; + llapi_err(LLAPI_MSG_ERROR, + "llapi_poollist: '%s' is not" + " a Lustre filesystem", + name); + return rc; + } + fsname = rname; + poolname = NULL; + } else { + /* name is FSNAME[.POOLNAME] */ + fsname = name; + poolname = strchr(name, '.'); + if (poolname != NULL) { + *poolname = '\0'; + poolname++; + } + rc = poolpath(fsname, NULL, pathname); + if (rc != 0) { + errno = -rc; + llapi_err(LLAPI_MSG_ERROR, + "llapi_poollist: Lustre filesystem '%s'" + " not found", name); + return rc; + } + } + if (rc != 0) { + errno = -rc; + llapi_err(LLAPI_MSG_ERROR, + "llapi_poollist: Lustre filesystem '%s' not found", + name); + return rc; + } + + if (poolname != NULL) { + rc = print_pool_members(fsname, pathname, poolname); + poolname--; + *poolname = '.'; + } else { + DIR *dir; + struct dirent *pool; + + llapi_printf(LLAPI_MSG_NORMAL, "Pools from %s:\n", fsname); + if ((dir = opendir(pathname)) == NULL) { + return -EINVAL; + } + while ((pool = readdir(dir)) != NULL) { + 
if (!((pool->d_name[0] == '.') && + (pool->d_name[1] == '\0')) && + !((pool->d_name[0] == '.') && + (pool->d_name[1] == '.') && + (pool->d_name[2] == '\0'))) + llapi_printf(LLAPI_MSG_NORMAL, " %s.%s\n", fsname, pool->d_name); + } + closedir(dir); + } + return rc; +} + typedef int (semantic_func_t)(char *path, DIR *parent, DIR *d, void *data, cfs_dirent_t *de); @@ -315,9 +588,9 @@ typedef int (semantic_func_t)(char *path, DIR *parent, DIR *d, static int common_param_init(struct find_param *param) { - param->lumlen = lov_mds_md_size(MAX_LOV_UUID_COUNT); + param->lumlen = lov_mds_md_size(MAX_LOV_UUID_COUNT, LOV_MAGIC_V3); if ((param->lmd = malloc(sizeof(lstat_t) + param->lumlen)) == NULL) { - llapi_err(LLAPI_MSG_ERROR, + llapi_err(LLAPI_MSG_ERROR, "error: allocation of %d bytes for ioctl", sizeof(lstat_t) + param->lumlen); return -ENOMEM; @@ -338,7 +611,7 @@ static void find_param_fini(struct find_param *param) free(param->lmd); } -int llapi_file_get_lov_fuuid(int fd, struct obd_uuid *lov_name) +int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_name) { int rc = ioctl(fd, OBD_IOC_GETNAME, lov_name); if (rc) { @@ -355,11 +628,11 @@ int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid) fd = open(path, O_RDONLY); if (fd < 0) { rc = errno; - llapi_err(LLAPI_MSG_ERROR, "error opening %s\n", path); + llapi_err(LLAPI_MSG_ERROR, "error opening %s", path); return rc; } - rc = llapi_file_get_lov_fuuid(fd, lov_uuid); + rc = llapi_file_fget_lov_uuid(fd, lov_uuid); close(fd); @@ -380,7 +653,7 @@ int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count) int rc = 0, index = 0; /* Get the lov name */ - rc = llapi_file_get_lov_fuuid(fd, &lov_name); + rc = llapi_file_fget_lov_uuid(fd, &lov_name); if (rc) return rc; @@ -422,11 +695,11 @@ static int setup_obd_uuid(DIR *dir, char *dname, struct find_param *param) int rc = 0, index; /* Get the lov name */ - rc = llapi_file_get_lov_fuuid(dirfd(dir), &lov_uuid); + rc = 
llapi_file_fget_lov_uuid(dirfd(dir), &lov_uuid); if (rc) { if (errno != ENOTTY) { rc = errno; - llapi_err(LLAPI_MSG_ERROR, + llapi_err(LLAPI_MSG_ERROR, "error: can't get lov name: %s", dname); } else { rc = 0; @@ -470,7 +743,7 @@ static int setup_obd_uuid(DIR *dir, char *dname, struct find_param *param) if (!param->quiet && param->obduuid && (param->obdindex == OBD_NOT_FOUND)) { - llapi_err_noerrno(LLAPI_MSG_ERROR, + llapi_err_noerrno(LLAPI_MSG_ERROR, "error: %s: unknown obduuid: %s", __FUNCTION__, param->obduuid->uuid); //rc = EINVAL; @@ -539,14 +812,16 @@ retry_get_uuids: return 0; } -void lov_dump_user_lmm_v1(struct lov_user_md_v1 *lum, char *path, int is_dir, - int obdindex, int quiet, int header, int body) +void lov_dump_user_lmm_v1v3(struct lov_user_md *lum, char *pool_name, + struct lov_user_ost_data_v1 *objects, + char *path, int is_dir, + int obdindex, int quiet, int header, int body) { int i, obdstripe = 0; if (obdindex != OBD_NOT_FOUND) { for (i = 0; !is_dir && i < lum->lmm_stripe_count; i++) { - if (obdindex == lum->lmm_objects[i].l_ost_idx) { + if (obdindex == objects[i].l_ost_idx) { llapi_printf(LLAPI_MSG_NORMAL, "%s\n", path); obdstripe = 1; break; @@ -564,44 +839,49 @@ void lov_dump_user_lmm_v1(struct lov_user_md_v1 *lum, char *path, int is_dir, llapi_printf(LLAPI_MSG_NORMAL, "(Default) "); lum->lmm_object_gr = LOV_OBJECT_GROUP_CLEAR; } - llapi_printf(LLAPI_MSG_NORMAL, + llapi_printf(LLAPI_MSG_NORMAL, "stripe_count: %d stripe_size: %u " - "stripe_offset: %d\n", + "stripe_offset: %d%s%s\n", lum->lmm_stripe_count == (__u16)-1 ? -1 : - lum->lmm_stripe_count, + lum->lmm_stripe_count, lum->lmm_stripe_size, lum->lmm_stripe_offset == (__u16)-1 ? -1 : - lum->lmm_stripe_offset); + lum->lmm_stripe_offset, + pool_name != NULL ? " pool: " : "", + pool_name != NULL ? 
pool_name : ""); } return; } if (header && (obdstripe == 1)) { - llapi_printf(LLAPI_MSG_NORMAL, + llapi_printf(LLAPI_MSG_NORMAL, "lmm_magic: 0x%08X\n", lum->lmm_magic); - llapi_printf(LLAPI_MSG_NORMAL, + llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_gr: "LPX64"\n", lum->lmm_object_gr); - llapi_printf(LLAPI_MSG_NORMAL, + llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_id: "LPX64"\n", lum->lmm_object_id); - llapi_printf(LLAPI_MSG_NORMAL, + llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_count: %u\n", (int)lum->lmm_stripe_count); - llapi_printf(LLAPI_MSG_NORMAL, + llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_size: %u\n", lum->lmm_stripe_size); - llapi_printf(LLAPI_MSG_NORMAL, + llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_pattern: %x\n", lum->lmm_pattern); + if (pool_name != NULL) + llapi_printf(LLAPI_MSG_NORMAL, + "lmm_pool_name: %s\n", pool_name); } if (body) { if ((!quiet) && (obdstripe == 1)) - llapi_printf(LLAPI_MSG_NORMAL, + llapi_printf(LLAPI_MSG_NORMAL, "\tobdidx\t\t objid\t\tobjid\t\t group\n"); for (i = 0; i < lum->lmm_stripe_count; i++) { - int idx = lum->lmm_objects[i].l_ost_idx; - long long oid = lum->lmm_objects[i].l_object_id; - long long gr = lum->lmm_objects[i].l_object_gr; + int idx = objects[i].l_ost_idx; + long long oid = objects[i].l_object_id; + long long gr = objects[i].l_object_gr; if ((obdindex == OBD_NOT_FOUND) || (obdindex == idx)) - llapi_printf(LLAPI_MSG_NORMAL, + llapi_printf(LLAPI_MSG_NORMAL, "\t%6u\t%14llu\t%#13llx\t%14llu%s\n", idx, oid, oid, gr, obdindex == idx ? 
" *" : ""); @@ -631,13 +911,13 @@ void lov_dump_user_lmm_join(struct lov_user_md_v1 *lum, char *path, } if (header && obdstripe == 1) { - llapi_printf(LLAPI_MSG_NORMAL, "lmm_magic: 0x%08X\n", + llapi_printf(LLAPI_MSG_NORMAL, "lmm_magic: 0x%08X\n", lumj->lmm_magic); - llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_gr: "LPX64"\n", + llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_gr: "LPX64"\n", lumj->lmm_object_gr); - llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_id: "LPX64"\n", + llapi_printf(LLAPI_MSG_NORMAL, "lmm_object_id: "LPX64"\n", lumj->lmm_object_id); - llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_count: %u\n", + llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_count: %u\n", (int)lumj->lmm_stripe_count); llapi_printf(LLAPI_MSG_NORMAL, "lmm_stripe_size: %u\n", lumj->lmm_stripe_size); @@ -650,7 +930,7 @@ void lov_dump_user_lmm_join(struct lov_user_md_v1 *lum, char *path, if (body) { unsigned long long start = -1, end = 0; if (!quiet && obdstripe == 1) - llapi_printf(LLAPI_MSG_NORMAL, + llapi_printf(LLAPI_MSG_NORMAL, "joined\tobdidx\t\t objid\t\tobjid\t\t group" "\t\tstart\t\tend\n"); for (i = 0; i < lumj->lmm_stripe_count; i++) { @@ -658,7 +938,7 @@ void lov_dump_user_lmm_join(struct lov_user_md_v1 *lum, char *path, long long oid = lumj->lmm_objects[i].l_object_id; long long gr = lumj->lmm_objects[i].l_object_gr; if (obdindex == OBD_NOT_FOUND || obdindex == idx) - llapi_printf(LLAPI_MSG_NORMAL, + llapi_printf(LLAPI_MSG_NORMAL, "\t%6u\t%14llu\t%#13llx\t%14llu%s", idx, oid, oid, gr, obdindex == idx ? 
" *" : ""); @@ -668,10 +948,10 @@ void lov_dump_user_lmm_join(struct lov_user_md_v1 *lum, char *path, llapi_printf(LLAPI_MSG_NORMAL, "\t%14llu", start); end = lumj->lmm_objects[i].l_extent_end; if (end == (unsigned long long)-1) - llapi_printf(LLAPI_MSG_NORMAL, + llapi_printf(LLAPI_MSG_NORMAL, "\t\tEOF\n"); else - llapi_printf(LLAPI_MSG_NORMAL, + llapi_printf(LLAPI_MSG_NORMAL, "\t\t%llu\n", end); } else { llapi_printf(LLAPI_MSG_NORMAL, "\t\t\t\t\n"); @@ -686,10 +966,12 @@ void llapi_lov_dump_user_lmm(struct find_param *param, { switch(*(__u32 *)&param->lmd->lmd_lmm) { /* lum->lmm_magic */ case LOV_USER_MAGIC_V1: - lov_dump_user_lmm_v1(&param->lmd->lmd_lmm, path, is_dir, - param->obdindex, param->quiet, - param->verbose, - (param->verbose || !param->obduuid)); + lov_dump_user_lmm_v1v3(&param->lmd->lmd_lmm, NULL, + param->lmd->lmd_lmm.lmm_objects, + path, is_dir, + param->obdindex, param->quiet, + param->verbose, + (param->verbose || !param->obduuid)); break; case LOV_USER_MAGIC_JOIN: lov_dump_user_lmm_join(&param->lmd->lmd_lmm, path, is_dir, @@ -697,10 +979,28 @@ void llapi_lov_dump_user_lmm(struct find_param *param, param->verbose, (param->verbose || !param->obduuid)); break; + case LOV_USER_MAGIC_V3: { + char pool_name[MAXPOOLNAME + 1]; + struct lov_user_ost_data_v1 *objects; + + strncpy(pool_name, + ((struct lov_user_md_v3 *)(&param->lmd->lmd_lmm))->lmm_pool_name, + MAXPOOLNAME); + pool_name[MAXPOOLNAME] = '\0'; + objects = ((struct lov_user_md_v3 *)(&param->lmd->lmd_lmm))->lmm_objects; + lov_dump_user_lmm_v1v3(&param->lmd->lmd_lmm, pool_name, + objects, path, is_dir, + param->obdindex, param->quiet, + param->verbose, + (param->verbose || !param->obduuid)); + break; + } default: - llapi_printf(LLAPI_MSG_NORMAL, - "unknown lmm_magic: %#x (expecting %#x)\n", - *(__u32 *)&param->lmd->lmd_lmm, LOV_USER_MAGIC_V1); + llapi_printf(LLAPI_MSG_NORMAL, "unknown lmm_magic: %#x " + "(expecting one of %#x %#x %#x)\n", + *(__u32 *)&param->lmd->lmd_lmm, + LOV_USER_MAGIC_V1, 
LOV_USER_MAGIC_JOIN, + 
LOV_USER_MAGIC_V3); return; } } @@ -785,7 +1085,8 @@ int llapi_mds_getfileinfo(char *path, DIR *parent, fname = (fname == NULL ? path : fname + 1); /* retrieve needed file info */ - strncpy((char *)lmd, fname, lov_mds_md_size(MAX_LOV_UUID_COUNT)); + strncpy((char *)lmd, fname, + lov_mds_md_size(MAX_LOV_UUID_COUNT, LOV_MAGIC)); ret = ioctl(dirfd(parent), IOC_MDC_GETFILEINFO, (void *)lmd); if (ret) { @@ -794,18 +1095,18 @@ int llapi_mds_getfileinfo(char *path, DIR *parent, * Do the regular lstat(2) instead. */ ret = lstat_f(path, st); if (ret) { - llapi_err(LLAPI_MSG_ERROR, + llapi_err(LLAPI_MSG_ERROR, "error: %s: lstat failed for %s", __FUNCTION__, path); return ret; } } else if (errno == ENOENT) { - llapi_err(LLAPI_MSG_WARN, - "warning: %s: %s does not exist", + llapi_err(LLAPI_MSG_WARN, + "warning: %s: %s does not exist", __FUNCTION__, path); return -ENOENT; } else { - llapi_err(LLAPI_MSG_ERROR, + llapi_err(LLAPI_MSG_ERROR, "error: %s: IOC_MDC_GETFILEINFO failed for %s", __FUNCTION__, path); return ret; @@ -894,7 +1195,7 @@ static int llapi_semantic_traverse(char *path, int size, DIR *parent, switch (dent->d_type) { case DT_UNKNOWN: - llapi_err(LLAPI_MSG_ERROR, + llapi_err(LLAPI_MSG_ERROR, "error: %s: '%s' is UNKNOWN type %d", __FUNCTION__, dent->d_name, dent->d_type); break; @@ -1089,13 +1390,13 @@ static int cb_find_init(char *path, DIR *parent, DIR *dir, lustre_fs = 0; ret = lstat_f(path, st); if (ret) { - llapi_err(LLAPI_MSG_ERROR, + llapi_err(LLAPI_MSG_ERROR, "error: %s: lstat failed for %s", __FUNCTION__, path); return ret; } } else if (errno == ENOENT) { - llapi_err(LLAPI_MSG_WARN, + llapi_err(LLAPI_MSG_WARN, "warning: %s: %s does not exist", __FUNCTION__, path); goto decided; @@ -1155,11 +1456,21 @@ static int cb_find_init(char *path, DIR *parent, DIR *dir, goto decided; } else { int i, j; + struct lov_user_ost_data_v1 *lmm_objects; + + if (param->lmd->lmd_lmm.lmm_magic == + LOV_USER_MAGIC_V3) { + lmm_objects = + ((struct lov_user_md_v3 
*)(&(param->lmd->lmd_lmm)))->lmm_objects; + } else { + lmm_objects = param->lmd->lmd_lmm.lmm_objects; + } + for (i = 0; i < param->lmd->lmd_lmm.lmm_stripe_count; i++) { for (j = 0; j < param->num_obds; j++) { if (param->obdindexes[j] == - param->lmd->lmd_lmm.lmm_objects[i].l_ost_idx) + lmm_objects[i].l_ost_idx) goto obd_matches; } } @@ -1189,6 +1500,23 @@ static int cb_find_init(char *path, DIR *parent, DIR *dir, } } + if (param->check_pool) { + /* empty requested pool is taken as no pool search => V1 */ + if (((param->lmd->lmd_lmm.lmm_magic == LOV_USER_MAGIC_V1) && + (param->poolname[0] == '\0')) || + ((param->lmd->lmd_lmm.lmm_magic == LOV_USER_MAGIC_V3) && + (strncmp(((struct lov_user_md_v3 *)(&(param->lmd->lmd_lmm)))->lmm_pool_name, + param->poolname, MAXPOOLNAME) == 0)) || + ((param->lmd->lmd_lmm.lmm_magic == LOV_USER_MAGIC_V3) && + (strcmp(param->poolname, "*") == 0))) { + if (param->exclude_pool) + goto decided; + } else { + if (!param->exclude_pool) + goto decided; + } + } + /* Check the time on mds. 
*/ if (!decision) { int for_mds; @@ -1208,7 +1536,7 @@ obd_matches: if (param->obdindex != OBD_NOT_FOUND) { /* Check whether the obd is active or not, if it is * not active, just print the object affected by this - * failed ost + * failed ost * */ struct obd_statfs stat_buf; struct obd_uuid uuid_buf; @@ -1216,15 +1544,15 @@ obd_matches: memset(&stat_buf, 0, sizeof(struct obd_statfs)); memset(&uuid_buf, 0, sizeof(struct obd_uuid)); ret = llapi_obd_statfs(path, LL_STATFS_LOV, - param->obdindex, &stat_buf, + param->obdindex, &stat_buf, &uuid_buf); if (ret) { - if (ret == -ENODATA || ret == -ENODEV + if (ret == -ENODATA || ret == -ENODEV || ret == -EIO) errno = EIO; - llapi_printf(LLAPI_MSG_NORMAL, + llapi_printf(LLAPI_MSG_NORMAL, "obd_uuid: %s failed %s ", - param->obduuid->uuid, + param->obduuid->uuid, strerror(errno)); goto print_path; } @@ -1239,12 +1567,12 @@ obd_matches: if (ret) { if (errno == ENOENT) { - llapi_err(LLAPI_MSG_ERROR, + llapi_err(LLAPI_MSG_ERROR, "warning: %s: %s does not exist", __FUNCTION__, path); goto decided; } else { - llapi_err(LLAPI_MSG_ERROR, + llapi_err(LLAPI_MSG_ERROR, "%s: IOC_LOV_GETINFO on %s failed", __FUNCTION__, path); return ret; @@ -1350,20 +1678,20 @@ static int cb_getstripe(char *path, DIR *parent, DIR *d, void *data, if (ret) { if (errno == ENODATA) { if (!param->obduuid && !param->quiet) - llapi_printf(LLAPI_MSG_NORMAL, + llapi_printf(LLAPI_MSG_NORMAL, "%s has no stripe info\n", path); goto out; } else if (errno == ENOTTY) { - llapi_err(LLAPI_MSG_ERROR, + llapi_err(LLAPI_MSG_ERROR, "%s: '%s' not on a Lustre fs?", __FUNCTION__, path); } else if (errno == ENOENT) { - llapi_err(LLAPI_MSG_WARN, + llapi_err(LLAPI_MSG_WARN, "warning: %s: %s does not exist", __FUNCTION__, path); goto out; } else { - llapi_err(LLAPI_MSG_ERROR, + llapi_err(LLAPI_MSG_ERROR, "error: %s: %s failed for %s", __FUNCTION__, d ? 
"LL_IOC_LOV_GETSTRIPE" : "IOC_MDC_GETFILESTRIPE", path); @@ -1388,7 +1716,7 @@ int llapi_getstripe(char *path, struct find_param *param) int ret = 0, len = strlen(path); if (len > PATH_MAX) { - llapi_err(LLAPI_MSG_ERROR, + llapi_err(LLAPI_MSG_ERROR, "%s: Path name '%s' is too long", __FUNCTION__, path); return -EINVAL; @@ -1434,7 +1762,7 @@ int llapi_obd_statfs(char *path, __u32 type, __u32 index, data.ioc_plen2 = sizeof(struct obd_uuid); if ((rc = obd_ioctl_pack(&data, &rawbuf, sizeof(raw))) != 0) { - llapi_err(LLAPI_MSG_ERROR, + llapi_err(LLAPI_MSG_ERROR, "llapi_obd_statfs: error packing ioctl data"); return rc; } @@ -1445,7 +1773,7 @@ int llapi_obd_statfs(char *path, __u32 type, __u32 index, if (fd < 0) { rc = errno ? -errno : -EBADF; - llapi_err(LLAPI_MSG_ERROR, "error: %s: opening '%s'", + llapi_err(LLAPI_MSG_ERROR, "error: %s: opening '%s'", __FUNCTION__, path); return rc; } @@ -1694,11 +2022,11 @@ static int cb_quotachown(char *path, DIR *parent, DIR *d, void *data, if (rc) { if (errno == ENODATA) { if (!param->obduuid && !param->quiet) - llapi_err(LLAPI_MSG_ERROR, + llapi_err(LLAPI_MSG_ERROR, "%s has no stripe info", path); rc = 0; } else if (errno == ENOENT) { - llapi_err(LLAPI_MSG_ERROR, + llapi_err(LLAPI_MSG_ERROR, "warning: %s: %s does not exist", __FUNCTION__, path); rc = 0; diff --git a/lustre/utils/llog_reader.c b/lustre/utils/llog_reader.c index f173b56..d1fd956 100644 --- a/lustre/utils/llog_reader.c +++ b/lustre/utils/llog_reader.c @@ -398,6 +398,26 @@ void print_lustre_cfg(struct lustre_cfg *lcfg, int *skip) createtime, canceltime); break; } + case(LCFG_POOL_NEW):{ + printf("pool new "); + print_1_cfg(lcfg); + break; + } + case(LCFG_POOL_ADD):{ + printf("pool add "); + print_1_cfg(lcfg); + break; + } + case(LCFG_POOL_REM):{ + printf("pool remove "); + print_1_cfg(lcfg); + break; + } + case(LCFG_POOL_DEL):{ + printf("pool destroy "); + print_1_cfg(lcfg); + break; + } default: printf("unsupported cmd_code = %x\n",cmd); } diff --git 
a/lustre/utils/obd.c b/lustre/utils/obd.c index 00ccd91..5a5bde3 100644 --- a/lustre/utils/obd.c +++ b/lustre/utils/obd.c @@ -52,6 +52,7 @@ #include #include #include +#include #include "obdctl.h" @@ -69,6 +70,7 @@ #include #include #include +#include #define MAX_STRING_SIZE 128 #define DEVICES_LIST "/proc/fs/lustre/devices" @@ -164,42 +166,51 @@ int lcfg_ioctl(char * func, int dev_id, struct lustre_cfg *lcfg) static int do_device(char *func, char *devname); -int lcfg_mgs_ioctl(char *func, int dev_id, struct lustre_cfg *lcfg) +static int get_mgs_device() { - struct obd_ioctl_data data; - static int mgs_device = -1; char mgs[] = "$MGS"; - int rc; + static int mgs_device = -1; - /* Always operates on MGS dev */ if (mgs_device == -1) { + int rc; do_disconnect(NULL, 1); rc = do_device("mgsioc", mgs); if (rc) { + fprintf(stderr, + "This command must be run on the MGS.\n"); errno = ENODEV; return -1; } mgs_device = cur_device; } + return mgs_device; +} + +/* Returns -1 on error with errno set */ +int lcfg_mgs_ioctl(char *func, int dev_id, struct lustre_cfg *lcfg) +{ + struct obd_ioctl_data data; + int rc; IOC_INIT(data); - data.ioc_dev = mgs_device; + rc = data.ioc_dev = get_mgs_device(); + if (rc < 0) + goto out; data.ioc_type = LUSTRE_CFG_TYPE; data.ioc_plen1 = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens); data.ioc_pbuf1 = (void *)lcfg; IOC_PACK(func, data); - rc = l_ioctl(dev_id, OBD_IOC_PARAM, buf); - - if (rc == ENODEV) - fprintf(stderr, "Is the MGS running on this node?\n"); - if (rc == ENOSYS) - fprintf(stderr, "Make sure cfg_device is set first.\n"); - if (rc == EINVAL) - fprintf(stderr, "cfg_device should be of the form " - "'lustre-MDT0000'\n"); - + rc = l_ioctl(dev_id, OBD_IOC_PARAM, buf); +out: + if (rc) { + if (errno == ENOSYS) + fprintf(stderr, "Make sure cfg_device is set first.\n"); + if (errno == EINVAL) + fprintf(stderr, "cfg_device should be of the form " + "'lustre-MDT0000'\n"); + } return rc; } @@ -2334,3 +2345,591 @@ void obd_finalize(int 
/*
 * Expand a glob pattern and copy its first match into out.
 *
 * out must have room for PATH_MAX + 1 bytes; the callers in this file
 * hand in a buffer of exactly that size.
 *
 * Returns 0 on success, -EINVAL when glob() fails, nothing matches, or
 * the first match is longer than PATH_MAX.
 */
static int first_glob_match(const char *pattern, char *out)
{
        glob_t glob_info;
        int rc = -EINVAL;

        if (glob(pattern, GLOB_BRACE, NULL, &glob_info))
                return -EINVAL;

        if (glob_info.gl_pathc != 0 &&
            strlen(glob_info.gl_pathv[0]) <= PATH_MAX) {
                strcpy(out, glob_info.gl_pathv[0]);
                rc = 0;
        }

        /* the match list is released on every path once glob() succeeded */
        globfree(&glob_info);
        return rc;
}

/*
 * Find the target_obd file of the lov device belonging to fsname
 * (/proc/fs/lustre/lov/<fsname>-<anything>/target_obd) and store its
 * path in path, which must hold PATH_MAX + 1 bytes.
 *
 * Returns 0 on success, -EINVAL if no such file exists or the pattern
 * does not fit in PATH_MAX bytes.
 */
static int find_target_obdpath(char *fsname, char *path)
{
        char pattern[PATH_MAX + 1];
        int len;

        len = snprintf(pattern, sizeof(pattern),
                       "/proc/fs/lustre/lov/%s-*/target_obd",
                       fsname);
        /* a truncated pattern would silently match something else */
        if (len < 0 || (size_t)len >= sizeof(pattern))
                return -EINVAL;

        return first_glob_match(pattern, path);
}

/*
 * Find the file describing pool poolname of filesystem fsname
 * (/proc/fs/lustre/lov/<fsname>-<anything>/pools/<poolname>) and store
 * its path in poolpath, which must hold PATH_MAX + 1 bytes.
 *
 * Returns 0 on success, -EINVAL if no such file exists or the pattern
 * does not fit in PATH_MAX bytes.
 */
static int find_poolpath(char *fsname, char *poolname, char *poolpath)
{
        char pattern[PATH_MAX + 1];
        int len;

        len = snprintf(pattern, sizeof(pattern),
                       "/proc/fs/lustre/lov/%s-*/pools/%s",
                       fsname, poolname);
        if (len < 0 || (size_t)len >= sizeof(pattern))
                return -EINVAL;

        return first_glob_match(pattern, poolpath);
}

/*
 * Look for an OST, or for any member at all, in the lists kept under
 * /proc/fs/lustre/lov.
 *
 * poolname == NULL: ostname is searched in the target_obd file of fsname;
 *                   returns 1 if a line names it, 0 if not
 * poolname != NULL: the file of that pool is read instead
 *     ostname == NULL: returns 1 if the pool file has at least one line,
 *                      0 if it is empty
 *     ostname != NULL: returns 1 if a line starts with ostname, 0 if not
 *
 * Returns -EINVAL when the file cannot be located or opened, or when
 * both poolname and ostname are NULL (there is nothing to search for).
 */
static int search_ost(char *fsname, char *poolname, char *ostname)
{
        FILE *fp;
        char buffer[PATH_MAX + 1];
        size_t len = 0;
        int found = 0;
        int rc;

        if (ostname != NULL)
                len = strlen(ostname);
        else if (poolname == NULL)
                return -EINVAL;

        if (poolname == NULL)
                rc = find_target_obdpath(fsname, buffer);
        else
                rc = find_poolpath(fsname, poolname, buffer);
        if (rc)
                return rc;

        fp = fopen(buffer, "r");
        if (fp == NULL)
                return -EINVAL;

        while (!found && fgets(buffer, sizeof(buffer), fp) != NULL) {
                if (poolname == NULL) {
                        /*
                         * NOTE(review): assumes target_obd lines look like
                         * "IDX: name status".  The previous code skipped
                         * exactly 3 bytes, which fits that layout only for
                         * a single-digit IDX; skipping to the first blank
                         * gives the same result for those lines and also
                         * handles longer indices - confirm against the
                         * code that generates target_obd.
                         */
                        char *name = strchr(buffer, ' ');

                        found = (name != NULL) &&
                                (strncmp(name + 1, ostname, len) == 0);
                } else {
                        /* any line proves the pool is not empty; with an
                         * ostname the line has to start with it */
                        found = (ostname == NULL) ||
                                (strncmp(buffer, ostname, len) == 0);
                }
        }
        fclose(fp);
        return found;
}
+static int check_pool_cmd(enum lcfg_command_type cmd, + char *fsname, char *poolname, + char *ostname) +{ + int rc = 0; + + switch (cmd) { + case LCFG_POOL_NEW: { + if (search_ost(fsname, poolname, NULL) >= 0) { + fprintf(stderr, "Pool %s.%s already exists\n", + fsname, poolname); + return -EEXIST; + } + return 0; + } + case LCFG_POOL_DEL: { + rc = search_ost(fsname, poolname, NULL); + if (rc < 0) { + fprintf(stderr, "Pool %s.%s not found\n", + fsname, poolname); + return -ENOENT; + } + if (rc == 1) { + fprintf(stderr, "Pool %s.%s not empty, " + "please remove all members\n", + fsname, poolname); + return -ENOTEMPTY; + } + return 0; + } + case LCFG_POOL_ADD: { + rc = search_ost(fsname, NULL, ostname); + if (rc == 0) { + fprintf(stderr, "OST %s not found in lov of %s\n", + ostname, fsname); + return -ENOENT; + } + rc = search_ost(fsname, poolname, ostname); + if (rc < 0) { + fprintf(stderr, "Pool %s.%s not found\n", + fsname, poolname); + return -ENOENT; + } + if (rc == 1) { + fprintf(stderr, "OST %s already in pool %s.%s\n", + ostname, fsname, poolname); + return -EEXIST; + } + return 0; + } + case LCFG_POOL_REM: { + rc = search_ost(fsname, poolname, ostname); + if (rc < 0) { + fprintf(stderr, "Pool %s.%s not found\n", + fsname, poolname); + return -ENOENT; + } + if (rc == 0) { + fprintf(stderr, "OST %s not found in pool %s.%s\n", + ostname, fsname, poolname); + return -ENOENT; + } + return 0; + } + default: { + } + } + return 0; +} + +static void check_pool_cmd_result(enum lcfg_command_type cmd, + char *fsname, char *poolname, + char *ostname) +{ + int cpt, rc = 0; + + cpt = 10; + switch (cmd) { + case LCFG_POOL_NEW: { + do { + rc = search_ost(fsname, poolname, NULL); + if (rc < 0) + sleep(2); + cpt--; + } while ((rc < 0) && (cpt > 0)); + if (rc >= 0) + fprintf(stderr, "Pool %s.%s created\n", + fsname, poolname); + else + fprintf(stderr, "Warning, pool %s.%s not found\n", + fsname, poolname); + return; + } + case LCFG_POOL_DEL: { + do { + rc = search_ost(fsname, 
poolname, NULL); + if (rc >= 0) + sleep(2); + cpt--; + } while ((rc >= 0) && (cpt > 0)); + if (rc < 0) + fprintf(stderr, "Pool %s.%s destroyed\n", + fsname, poolname); + else + fprintf(stderr, "Warning, pool %s.%s still found\n", + fsname, poolname); + return; + } + case LCFG_POOL_ADD: { + do { + rc = search_ost(fsname, poolname, ostname); + if (rc != 1) + sleep(2); + cpt--; + } while ((rc != 1) && (cpt > 0)); + if (rc == 1) + fprintf(stderr, "OST %s added to pool %s.%s\n", + ostname, fsname, poolname); + else + fprintf(stderr, "Warning, OST %s not found in pool %s.%s\n", + ostname, fsname, poolname); + return; + } + case LCFG_POOL_REM: { + do { + rc = search_ost(fsname, poolname, ostname); + if (rc == 1) + sleep(2); + cpt--; + } while ((rc == 1) && (cpt > 0)); + if (rc != 1) + fprintf(stderr, "OST %s removed from pool %s.%s\n", + ostname, fsname, poolname); + else + fprintf(stderr, "Warning, OST %s still found in pool %s.%s\n", + ostname, fsname, poolname); + return; + } + default: { + } + } +} + +static int check_and_complete_ostname(char *fsname, char *ostname) +{ + char *ptr; + char real_ostname[MAX_OBD_NAME + 1]; + char i; + + /* if OST name does not start with fsname, we add it */ + /* if not check if the fsname is the right one */ + ptr = strchr(ostname, '-'); + if (ptr == NULL) { + sprintf(real_ostname, "%s-%s", fsname, ostname); + } else if (strncmp(ostname, fsname, strlen(fsname)) != 0) { + fprintf(stderr, "%s does not start with fsname %s\n", + ostname, fsname); + return -EINVAL; + } else { + strcpy(real_ostname, ostname); + } + /* real_ostname is fsname-????? */ + ptr = real_ostname + strlen(fsname) + 1; + if (strncmp(ptr, "OST", 3) != 0) { + fprintf(stderr, "%s does not start by %s-OST nor OST\n", + ostname, fsname); + return -EINVAL; + } + /* real_ostname is fsname-OST????? 
*/ + ptr += 3; + for (i = 0; i < 4; i++) { + if (!isxdigit(*ptr)) { + fprintf(stderr, + "ost's index in %s is not an hexa number\n", + ostname); + return -EINVAL; + } + ptr++; + } + /* real_ostname is fsname-OSTXXXX????? */ + /* if OST name does not end with _UUID, we add it */ + if (*ptr == '\0') { + strcat(real_ostname, "_UUID"); + } else if (strcmp(ptr, "_UUID") != 0) { + fprintf(stderr, + "ostname %s does not end with _UUID\n", ostname); + return -EINVAL; + } + /* real_ostname is fsname-OSTXXXX_UUID */ + strcpy(ostname, real_ostname); + return 0; +} + +/* returns 0 or -errno */ +static int pool_cmd(enum lcfg_command_type cmd, + char *cmdname, char *fullpoolname, + char *fsname, char *poolname, char *ostname) +{ + int rc = 0; + struct obd_ioctl_data data; + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg; + + rc = check_pool_cmd(cmd, fsname, poolname, ostname); + if (rc) + return rc; + + lustre_cfg_bufs_reset(&bufs, NULL); + lustre_cfg_bufs_set_string(&bufs, 0, cmdname); + lustre_cfg_bufs_set_string(&bufs, 1, fullpoolname); + if (ostname != NULL) + lustre_cfg_bufs_set_string(&bufs, 2, ostname); + + lcfg = lustre_cfg_new(cmd, &bufs); + if (IS_ERR(lcfg)) { + rc = PTR_ERR(lcfg); + return rc; + } + + IOC_INIT(data); + rc = data.ioc_dev = get_mgs_device(); + if (rc < 0) + goto out; + + data.ioc_type = LUSTRE_CFG_TYPE; + data.ioc_plen1 = lustre_cfg_len(lcfg->lcfg_bufcount, + lcfg->lcfg_buflens); + data.ioc_pbuf1 = (void *)lcfg; + IOC_PACK(cmdname, data); + + rc = l_ioctl(OBD_DEV_ID, OBD_IOC_POOL, buf); +out: + if (rc) + rc = -errno; + lustre_cfg_free(lcfg); + return rc; +} + +/* + * this function tranforms a rule [start-end/step] into an array + * of matching numbers + * supported forms are: + * [start] : just this number + * [start-end] : all numbers from start to end + * [start-end/step] : numbers from start to end with increment of step + * on return, format contains a printf format string which can be used + * to generate all the strings + */ +static int 
get_array_idx(char *rule, char *format, int **array) +{ + char *start, *end, *ptr; + unsigned int lo, hi, step; + int array_sz = 0; + int i, array_idx; + int rc; + + start = strchr(rule, '['); + end = strchr(rule, ']'); + if ((start == NULL) || (end == NULL)) { + *array = malloc(sizeof(int)); + if (*array == NULL) + return 0; + strcpy(format, rule); + array_sz = 1; + return array_sz; + } + *start = '\0'; + *end = '\0'; + end++; + start++; + /* put in format the printf format (the rule without the range) */ + sprintf(format, "%s%%.4d%s", rule, end); + + array_idx = 0; + array_sz = 0; + *array = NULL; + /* loop on , separator */ + do { + /* extract the 3 fields */ + rc = sscanf(start, "%u-%u/%u", &lo, &hi, &step); + switch (rc) { + case 0: { + return 0; + } + case 1: { + array_sz++; + *array = realloc(*array, array_sz * sizeof(int)); + if (*array == NULL) + return 0; + (*array)[array_idx] = lo; + array_idx++; + break; + } + case 2: { + step = 1; + /* do not break to share code with case 3: */ + } + case 3: { + if ((hi < lo) || (step == 0)) + return 0; + array_sz += (hi - lo) / step + 1; + *array = realloc(*array, sizeof(int) * array_sz); + if (*array == NULL) + return 0; + for (i = lo; i <= hi; i+=step, array_idx++) + (*array)[array_idx] = i; + break; + } + } + ptr = strchr(start, ','); + if (ptr != NULL) + start = ptr + 1; + + } while (ptr != NULL); + return array_sz; +} + +static int extract_fsname_poolname(char *arg, char *fsname, char *poolname) +{ + char *ptr; + int len; + int rc; + + strcpy(fsname, arg); + ptr = strchr(fsname, '.'); + if (ptr == NULL) { + fprintf(stderr, ". 
is missing in %s\n", fsname); + rc = -EINVAL; + goto err; + } + + len = ptr - fsname; + if (len == 0) { + fprintf(stderr, "fsname is empty\n"); + rc = -EINVAL; + goto err; + } + + len = strlen(ptr + 1); + if (len == 0) { + fprintf(stderr, "poolname is empty\n"); + rc = -EINVAL; + goto err; + } + if (len > MAXPOOLNAME) { + fprintf(stderr, + "poolname %s is too long (length is %d max is %d)\n", + ptr + 1, len, MAXPOOLNAME); + rc = -ENAMETOOLONG; + goto err; + } + strncpy(poolname, ptr + 1, MAXPOOLNAME); + poolname[MAXPOOLNAME] = '\0'; + *ptr = '\0'; + return 0; + +err: + fprintf(stderr, "argument %s must be .\n", arg); + return rc; +} + +int jt_pool_cmd(int argc, char **argv) +{ + enum lcfg_command_type cmd; + char fsname[PATH_MAX + 1]; + char poolname[MAXPOOLNAME + 1]; + char *ostnames_buf = NULL; + int i, rc; + int *array = NULL, array_sz; + struct { + int rc; + char *ostname; + } *cmds = NULL; + + switch (argc) { + case 0: + case 1: return CMD_HELP; + case 2: { + if (strcmp("pool_new", argv[0]) == 0) + cmd = LCFG_POOL_NEW; + else if (strcmp("pool_destroy", argv[0]) == 0) + cmd = LCFG_POOL_DEL; + else if (strcmp("pool_list", argv[0]) == 0) + return llapi_poollist(argv[1]); + else return CMD_HELP; + + rc = extract_fsname_poolname(argv[1], fsname, poolname); + if (rc) + break; + + rc = pool_cmd(cmd, argv[0], argv[1], + fsname, poolname, NULL); + if (rc) + break; + + check_pool_cmd_result(cmd, fsname, poolname, NULL); + break; + } + default: { + char format[2*MAX_OBD_NAME]; + + if (strcmp("pool_remove", argv[0]) == 0) { + cmd = LCFG_POOL_REM; + } else if (strcmp("pool_add", argv[0]) == 0) { + cmd = LCFG_POOL_ADD; + } else { + return CMD_HELP; + } + + rc = extract_fsname_poolname(argv[1], fsname, poolname); + if (rc) + break; + + for (i = 2; i < argc; i++) { + int j; + + array_sz = get_array_idx(argv[i], format, &array); + if (array_sz == 0) + return CMD_HELP; + + cmds = malloc(array_sz * sizeof(cmds[0])); + if (cmds != NULL) { + ostnames_buf = malloc(array_sz * + 
(MAX_OBD_NAME + 1)); + } else { + free(array); + rc = -ENOMEM; + goto out; + } + + for (j = 0; j < array_sz; j++) { + char ostname[MAX_OBD_NAME + 1]; + + snprintf(ostname, MAX_OBD_NAME, format, + array[j]); + ostname[MAX_OBD_NAME] = '\0'; + + rc = check_and_complete_ostname(fsname,ostname); + if (rc) { + free(array); + free(cmds); + if (ostnames_buf) + free(ostnames_buf); + goto out; + } + if (ostnames_buf != NULL) { + cmds[j].ostname = + &ostnames_buf[(MAX_OBD_NAME + 1) * j]; + strcpy(cmds[j].ostname, ostname); + } else { + cmds[j].ostname = NULL; + } + cmds[j].rc = pool_cmd(cmd, argv[0], argv[1], + fsname, poolname, + ostname); + } + for (j = 0; j < array_sz; j++) { + if (!cmds[j].rc) { + char ostname[MAX_OBD_NAME + 1]; + + if (!cmds[j].ostname) { + snprintf(ostname, MAX_OBD_NAME, + format, array[j]); + ostname[MAX_OBD_NAME] = '\0'; + check_and_complete_ostname( + fsname, ostname); + } else { + strcpy(ostname, + cmds[j].ostname); + } + check_pool_cmd_result(cmd, fsname, + poolname,ostname); + } + } + if (array_sz > 0) + free(array); + if (cmds) + free(cmds); + if (ostnames_buf); + free(ostnames_buf); + } + return 0; + } + } + + +out: + if ((rc == -EINVAL) || (rc == -ENOENT)) + fprintf(stderr, "Does the fs, pool or ost exist?\n"); + if (rc != 0) { + errno = -rc; + perror(argv[0]); + } + + return rc; +} diff --git a/lustre/utils/obdctl.h b/lustre/utils/obdctl.h index a5fd90a..eeb1bb8 100644 --- a/lustre/utils/obdctl.h +++ b/lustre/utils/obdctl.h @@ -119,4 +119,6 @@ int jt_blockdev_attach(int argc, char **argv); int jt_blockdev_detach(int argc, char **argv); int jt_blockdev_info(int argc, char **argv); +int jt_pool_cmd(int argc, char **argv); + #endif diff --git a/lustre/utils/req-layout.c b/lustre/utils/req-layout.c index ef6d35b..dc366bd 100644 --- a/lustre/utils/req-layout.c +++ b/lustre/utils/req-layout.c @@ -60,7 +60,7 @@ #define lustre_swab_ldlm_request NULL #define lustre_swab_ldlm_reply NULL #define lustre_swab_ldlm_intent NULL -#define 
lustre_swab_lov_mds_md NULL +/* #define lustre_swab_lov_mds_md NULL */ #define lustre_swab_mdt_rec_reint NULL #define lustre_swab_lustre_capa NULL #define lustre_swab_lustre_capa_key NULL diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index 05cdec0..e75cf6c 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -344,6 +344,33 @@ check_lov_mds_md_join(void) } static void +check_lov_mds_md_v3(void) +{ + BLANK_LINE(); + CHECK_STRUCT(lov_mds_md_v3); + CHECK_MEMBER(lov_mds_md_v3, lmm_magic); + CHECK_MEMBER(lov_mds_md_v3, lmm_pattern); + CHECK_MEMBER(lov_mds_md_v3, lmm_object_id); + CHECK_MEMBER(lov_mds_md_v3, lmm_object_gr); + CHECK_MEMBER(lov_mds_md_v3, lmm_stripe_size); + CHECK_MEMBER(lov_mds_md_v3, lmm_stripe_count); + CHECK_MEMBER(lov_mds_md_v3, lmm_pool_name); + CHECK_MEMBER(lov_mds_md_v3, lmm_objects); + + BLANK_LINE(); + CHECK_STRUCT(lov_ost_data_v1); + CHECK_MEMBER(lov_ost_data_v1, l_object_id); + CHECK_MEMBER(lov_ost_data_v1, l_object_gr); + CHECK_MEMBER(lov_ost_data_v1, l_ost_gen); + CHECK_MEMBER(lov_ost_data_v1, l_ost_idx); + + CHECK_CDEFINE(LOV_MAGIC_V3); + + CHECK_VALUE(LOV_PATTERN_RAID0); + CHECK_VALUE(LOV_PATTERN_RAID1); +} + +static void check_obd_statfs(void) { BLANK_LINE(); @@ -1307,6 +1334,7 @@ main(int argc, char **argv) check_obd_connect_data(); check_obdo(); check_lov_mds_md_v1(); + check_lov_mds_md_v3(); check_lov_mds_md_join(); check_obd_statfs(); check_obd_ioobj(); -- 1.8.3.1