From c1d0a355a6a64ec97c9f56c38ba036e5e50cd8c4 Mon Sep 17 00:00:00 2001 From: Lai Siyao Date: Mon, 5 Aug 2019 02:08:02 +0800 Subject: [PATCH 1/1] LU-12624 lod: alloc dir stripes by QoS Similar to file OST object allocation, introduce directory stripe allocation by space usage, but they don't share the same code because of the many differences between them: a file has mirrors, PFL, and object precreation; while for a directory, the first stripe is always on the same MDT as its master object. The changes include: * add lod_mdt_alloc_qos() to allocate stripes by space/inode usage. * add lod_mdt_alloc_rr() to allocate stripes round-robin. * add lod_mdt_alloc_specific() to allocate stripes in the old way. * add sysfs support for the lmv_desc field in the LOD structure, and move the entries that remained in procfs to sysfs. This patch also changes the LMV QoS code: * mkdir by QoS if the user runs 'lfs mkdir -i -1 ...', or if the parent directory's default LMV starting MDT index is -1. * with the above change, the 'space' hash flag is no longer needed, so remove all related code. * previously the 'lfs mkdir -i -1' QoS code was in lfs_setdirstripe(), but now it's done in LMV, so remove the old code. Update sanity tests 413a and 413b to support QoS mkdir of both plain and striped directories. Update the lfs-setdirstripe man page to reflect the changes. 
Signed-off-by: Lai Siyao Change-Id: I8f5f8e46faae68ffd9a49a4ac1d450e951e979c5 Reviewed-on: https://review.whamcloud.com/35825 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Hongchao Zhang Reviewed-by: Oleg Drokin --- lustre/doc/lfs-setdirstripe.1 | 7 - lustre/include/lprocfs_status.h | 1 + lustre/include/lustre_lmv.h | 12 - lustre/include/uapi/linux/lustre/lustre_user.h | 10 +- lustre/lmv/lmv_intent.c | 16 +- lustre/lmv/lmv_internal.h | 4 +- lustre/lmv/lmv_obd.c | 280 +++++----- lustre/lod/lod_internal.h | 18 +- lustre/lod/lod_lov.c | 71 ++- lustre/lod/lod_object.c | 288 ++++++---- lustre/lod/lod_pool.c | 31 +- lustre/lod/lod_qos.c | 638 ++++++++++++++++----- lustre/lod/lproc_lod.c | 732 ++++++++++++++----------- lustre/mdt/mdt_reint.c | 13 - lustre/obdclass/lprocfs_status.c | 41 +- lustre/obdclass/lu_tgt_descs.c | 17 +- lustre/ptlrpc/wiretest.c | 1 - lustre/tests/sanity.sh | 232 +++++--- lustre/utils/lfs.c | 140 +---- lustre/utils/liblustreapi.c | 4 +- lustre/utils/wirecheck.c | 1 - lustre/utils/wiretest.c | 1 - 22 files changed, 1520 insertions(+), 1038 deletions(-) diff --git a/lustre/doc/lfs-setdirstripe.1 b/lustre/doc/lfs-setdirstripe.1 index fda8be3..9f52fa8 100644 --- a/lustre/doc/lfs-setdirstripe.1 +++ b/lustre/doc/lfs-setdirstripe.1 @@ -44,13 +44,6 @@ Fowler-Noll-Vo (FNV-1a) hash algorithm. This provides reasonably uniform, but not cryptographically strong, hashing of the filename. (default) .TP -.B space -This can only be set on plain directory default striping. -Newly created sub-directories will be distributed on all -MDTs by MDT space usage. Note that this is suggested to -be set on lustre ROOT, so that all sub-directories under -lustre ROOT are distributed among all MDTs. -.TP .B all_char Sum of ASCII characters modulo number of MDTs. 
This provides weak hashing of the filename, and is suitable diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h index 98a2b1a..2b66f7a 100644 --- a/lustre/include/lprocfs_status.h +++ b/lustre/include/lprocfs_status.h @@ -614,6 +614,7 @@ extern ssize_t lprocfs_pinger_recov_seq_write(struct file *file, const char __user *buffer, size_t count, loff_t *off); +int lu_str_to_s64(char *buffer, unsigned long count, __s64 *val, char defunit); extern int lprocfs_str_with_units_to_s64(const char __user *buffer, unsigned long count, __s64 *val, char defunit); diff --git a/lustre/include/lustre_lmv.h b/lustre/include/lustre_lmv.h index 45c5366..d32b4ac 100644 --- a/lustre/include/lustre_lmv.h +++ b/lustre/include/lustre_lmv.h @@ -54,12 +54,6 @@ struct lmv_stripe_md { struct lmv_oinfo lsm_md_oinfo[0]; }; -static inline bool lmv_is_known_hash_type(__u32 type) -{ - return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 || - (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS; -} - static inline bool lmv_dir_striped(const struct lmv_stripe_md *lsm) { return lsm && lsm->lsm_md_magic == LMV_MAGIC; @@ -88,12 +82,6 @@ static inline bool lmv_dir_bad_hash(const struct lmv_stripe_md *lsm) return !lmv_is_known_hash_type(lsm->lsm_md_hash_type); } -/* NB, this is checking directory default LMV */ -static inline bool lmv_dir_qos_mkdir(const struct lmv_stripe_md *lsm) -{ - return lsm && (lsm->lsm_md_hash_type & LMV_HASH_FLAG_SPACE); -} - static inline bool lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) { diff --git a/lustre/include/uapi/linux/lustre/lustre_user.h b/lustre/include/uapi/linux/lustre/lustre_user.h index d7d8d48..5c5cd41 100644 --- a/lustre/include/uapi/linux/lustre/lustre_user.h +++ b/lustre/include/uapi/linux/lustre/lustre_user.h @@ -712,6 +712,7 @@ struct fsxattr { #define LOV_PATTERN_DEFAULT 0xffffffff #define LOV_OFFSET_DEFAULT ((__u16)-1) +#define LMV_OFFSET_DEFAULT ((__u32)-1) static inline bool 
lov_pattern_supported(__u32 pattern) { @@ -1001,10 +1002,11 @@ enum lmv_hash_type { * might be interpreted differently with different flags. */ #define LMV_HASH_TYPE_MASK 0x0000ffff -/* once this is set on a plain directory default layout, newly created - * subdirectories will be distributed on all MDTs by space usage. - */ -#define LMV_HASH_FLAG_SPACE 0x08000000 +static inline bool lmv_is_known_hash_type(__u32 type) +{ + return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 || + (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS; +} /* The striped directory has ever lost its master LMV EA, then LFSCK * re-generated it. This flag is used to indicate such case. It is an diff --git a/lustre/lmv/lmv_intent.c b/lustre/lmv/lmv_intent.c index 43c6b66..11a78b1 100644 --- a/lustre/lmv/lmv_intent.c +++ b/lustre/lmv/lmv_intent.c @@ -305,22 +305,10 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, /* * open(O_CREAT | O_EXCL) needs to check * existing name, which should be done on both - * old and new layout, to avoid creating new - * file under old layout, check old layout on + * old and new layout, check old layout on * client side. 
*/ - tgt = lmv_locate_tgt(lmv, op_data); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - rc = md_getattr_name(tgt->ltd_exp, op_data, - reqp); - if (!rc) { - ptlrpc_req_finished(*reqp); - *reqp = NULL; - RETURN(-EEXIST); - } - + rc = lmv_migrate_existence_check(lmv, op_data); if (rc != -ENOENT) RETURN(rc); diff --git a/lustre/lmv/lmv_internal.h b/lustre/lmv/lmv_internal.h index a58bebd..84a6d98 100644 --- a/lustre/lmv/lmv_internal.h +++ b/lustre/lmv/lmv_internal.h @@ -49,7 +49,6 @@ int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, int lmv_blocking_ast(struct ldlm_lock *, struct ldlm_lock_desc *, void *, int); int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds); -int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds); int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, struct lu_fid *fid, struct md_op_data *op_data); @@ -218,8 +217,9 @@ static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data) struct lmv_tgt_desc *lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data); +int lmv_migrate_existence_check(struct lmv_obd *lmv, + struct md_op_data *op_data); /* lproc_lmv.c */ int lmv_tunables_init(struct obd_device *obd); - #endif diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c index a01505b..8af14da 100644 --- a/lustre/lmv/lmv_obd.c +++ b/lustre/lmv/lmv_obd.c @@ -1055,111 +1055,38 @@ hsm_req_err: RETURN(rc); } -/** - * This is _inode_ placement policy function (not name). - */ -static u32 lmv_placement_policy(struct obd_device *obd, - struct md_op_data *op_data) +int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, + struct lu_fid *fid, struct md_op_data *op_data) { + struct obd_device *obd = class_exp2obd(exp); struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_user_md *lum; - u32 mdt; - - ENTRY; - - if (lmv->lmv_mdt_count == 1) - RETURN(0); - - lum = op_data->op_data; - /* - * Choose MDT by - * 1. See if the stripe offset is specified by lum. - * 2. 
If parent has default LMV, and its hash type is "space", choose - * MDT with QoS. (see lmv_locate_tgt_qos()). - * 3. Then check if default LMV stripe offset is not -1. - * 4. Finally choose MDS by name hash if the parent - * is striped directory. (see lmv_locate_tgt()). - * - * presently explicit MDT location is not supported - * for foreign dirs (as it can't be embedded into free - * format LMV, like with lum_stripe_offset), so we only - * rely on default stripe offset or then name hashing. - */ - if (op_data->op_cli_flags & CLI_SET_MEA && lum != NULL && - le32_to_cpu(lum->lum_magic != LMV_MAGIC_FOREIGN) && - le32_to_cpu(lum->lum_stripe_offset) != (__u32)-1) { - mdt = le32_to_cpu(lum->lum_stripe_offset); - } else if (op_data->op_code == LUSTRE_OPC_MKDIR && - !lmv_dir_striped(op_data->op_mea1) && - lmv_dir_qos_mkdir(op_data->op_default_mea1)) { - mdt = op_data->op_mds; - } else if (op_data->op_code == LUSTRE_OPC_MKDIR && - op_data->op_default_mea1 && - op_data->op_default_mea1->lsm_md_master_mdt_index != - (__u32)-1) { - mdt = op_data->op_default_mea1->lsm_md_master_mdt_index; - op_data->op_mds = mdt; - } else { - mdt = op_data->op_mds; - } - - RETURN(mdt); -} - -int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds) -{ struct lmv_tgt_desc *tgt; int rc; ENTRY; - tgt = lmv_tgt(lmv, mds); + LASSERT(op_data); + LASSERT(fid); + + tgt = lmv_tgt(lmv, op_data->op_mds); if (!tgt) RETURN(-ENODEV); + if (!tgt->ltd_active || !tgt->ltd_exp) + RETURN(-ENODEV); + /* * New seq alloc and FLD setup should be atomic. Otherwise we may find * on server that seq in new allocated fid is not yet known. */ mutex_lock(&tgt->ltd_fid_mutex); - - if (tgt->ltd_active == 0 || tgt->ltd_exp == NULL) - GOTO(out, rc = -ENODEV); - - /* - * Asking underlying tgt layer to allocate new fid. 
- */ rc = obd_fid_alloc(NULL, tgt->ltd_exp, fid, NULL); + mutex_unlock(&tgt->ltd_fid_mutex); if (rc > 0) { LASSERT(fid_is_sane(fid)); rc = 0; } - EXIT; -out: - mutex_unlock(&tgt->ltd_fid_mutex); - return rc; -} - -int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, - struct lu_fid *fid, struct md_op_data *op_data) -{ - struct obd_device *obd = class_exp2obd(exp); - struct lmv_obd *lmv = &obd->u.lmv; - u32 mds; - int rc; - - ENTRY; - - LASSERT(op_data != NULL); - LASSERT(fid != NULL); - - mds = lmv_placement_policy(obd, op_data); - - rc = __lmv_fid_alloc(lmv, fid, mds); - if (rc) - CERROR("Can't alloc new fid, rc %d\n", rc); - RETURN(rc); } @@ -1659,8 +1586,7 @@ lmv_locate_tgt_by_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, * which is set outside, and if dir is migrating, 'op_data->op_post_migrate' * indicates whether old or new layout is used to locate. * - * For plain direcotry, normally it will locate MDT by FID, but if this - * directory has default LMV, and its hash type is "space", locate MDT with QoS. + * For plain directory, it just locates the MDT of op_data->op_fid1. * * \param[in] lmv LMV device * \param[in] op_data client MD stack parameters, name, namelen @@ -1683,7 +1609,7 @@ lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data) * index if the file under striped dir is being restored, see * ct_restore(). 
*/ if (op_data->op_bias & MDS_CREATE_VOLATILE && - (int)op_data->op_mds != -1) { + op_data->op_mds != LMV_OFFSET_DEFAULT) { tgt = lmv_tgt(lmv, op_data->op_mds); if (!tgt) return ERR_PTR(-ENODEV); @@ -1711,30 +1637,7 @@ lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data) op_data->op_mds = oinfo->lmo_mds; tgt = lmv_tgt(lmv, oinfo->lmo_mds); if (!tgt) - tgt = ERR_PTR(-ENODEV); - } else if (op_data->op_code == LUSTRE_OPC_MKDIR && - lmv_dir_qos_mkdir(op_data->op_default_mea1) && - !lmv_dir_striped(lsm)) { - tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds); - if (tgt == ERR_PTR(-EAGAIN)) - tgt = lmv_locate_tgt_rr(lmv, &op_data->op_mds); - /* - * only update statfs when mkdir under dir with "space" hash, - * this means the cached statfs may be stale, and current mkdir - * may not follow QoS accurately, but it's not serious, and it - * avoids periodic statfs when client doesn't mkdir under - * "space" hashed directories. - * - * TODO: after MDT support QoS object allocation, also update - * statfs for 'lfs mkdir -i -1 ...", currently it's done in user - * space. - */ - if (!IS_ERR(tgt)) { - struct obd_device *obd; - - obd = container_of(lmv, struct obd_device, u.lmv); - lmv_statfs_check_update(obd, tgt); - } + return ERR_PTR(-ENODEV); } else { tgt = lmv_locate_tgt_by_name(lmv, op_data->op_mea1, op_data->op_name, op_data->op_namelen, @@ -1787,6 +1690,78 @@ lmv_locate_tgt2(struct lmv_obd *lmv, struct md_op_data *op_data) &op_data->op_mds, true); } +int lmv_migrate_existence_check(struct lmv_obd *lmv, struct md_op_data *op_data) +{ + struct lu_tgt_desc *tgt; + struct ptlrpc_request *request; + int rc; + + LASSERT(lmv_dir_migrating(op_data->op_mea1)); + + tgt = lmv_locate_tgt(lmv, op_data); + if (IS_ERR(tgt)) + return PTR_ERR(tgt); + + rc = md_getattr_name(tgt->ltd_exp, op_data, &request); + if (!rc) { + ptlrpc_req_finished(request); + return -EEXIST; + } + + return rc; +} + +/* mkdir by QoS in two cases: + * 1. 'lfs mkdir -i -1' + * 2. 
parent default LMV master_mdt_index is -1 + * + * NB, mkdir by QoS only if parent is not striped, this is to avoid remote + * directories under striped directory. + */ +static inline bool lmv_op_qos_mkdir(const struct md_op_data *op_data) +{ + const struct lmv_stripe_md *lsm = op_data->op_default_mea1; + const struct lmv_user_md *lum = op_data->op_data; + + if (op_data->op_code != LUSTRE_OPC_MKDIR) + return false; + + if (lmv_dir_striped(op_data->op_mea1)) + return false; + + if (op_data->op_cli_flags & CLI_SET_MEA && lum && + (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC || + le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) && + le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT) + return true; + + if (lsm && lsm->lsm_md_master_mdt_index == LMV_OFFSET_DEFAULT) + return true; + + return false; +} + +/* 'lfs mkdir -i ' */ +static inline bool lmv_op_user_specific_mkdir(const struct md_op_data *op_data) +{ + const struct lmv_user_md *lum = op_data->op_data; + + return op_data->op_code == LUSTRE_OPC_MKDIR && + op_data->op_cli_flags & CLI_SET_MEA && lum && + (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC || + le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) && + le32_to_cpu(lum->lum_stripe_offset) != LMV_OFFSET_DEFAULT; +} + +/* parent default LMV master_mdt_index is not -1. 
*/ +static inline bool +lmv_op_default_specific_mkdir(const struct md_op_data *op_data) +{ + return op_data->op_code == LUSTRE_OPC_MKDIR && + op_data->op_default_mea1 && + op_data->op_default_mea1->lsm_md_master_mdt_index != + LMV_OFFSET_DEFAULT; +} int lmv_create(struct obd_export *exp, struct md_op_data *op_data, const void *data, size_t datalen, umode_t mode, uid_t uid, gid_t gid, cfs_cap_t cap_effective, __u64 rdev, @@ -1808,20 +1783,9 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, if (lmv_dir_migrating(op_data->op_mea1)) { /* * if parent is migrating, create() needs to lookup existing - * name, to avoid creating new file under old layout of - * migrating directory, check old layout here. + * name in both old and new layout, check old layout on client. */ - tgt = lmv_locate_tgt(lmv, op_data); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - rc = md_getattr_name(tgt->ltd_exp, op_data, request); - if (!rc) { - ptlrpc_req_finished(*request); - *request = NULL; - RETURN(-EEXIST); - } - + rc = lmv_migrate_existence_check(lmv, op_data); if (rc != -ENOENT) RETURN(rc); @@ -1832,26 +1796,44 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); - CDEBUG(D_INODE, "CREATE name '%.*s' on "DFID" -> mds #%x\n", - (int)op_data->op_namelen, op_data->op_name, - PFID(&op_data->op_fid1), op_data->op_mds); + if (lmv_op_qos_mkdir(op_data)) { + tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds); + if (tgt == ERR_PTR(-EAGAIN)) + tgt = lmv_locate_tgt_rr(lmv, &op_data->op_mds); + /* + * only update statfs after QoS mkdir, this means the cached + * statfs may be stale, and current mkdir may not follow QoS + * accurately, but it's not serious, and avoids periodic statfs + * when client doesn't mkdir by QoS. 
+ */ + if (!IS_ERR(tgt)) + lmv_statfs_check_update(obd, tgt); + } else if (lmv_op_user_specific_mkdir(op_data)) { + struct lmv_user_md *lum = op_data->op_data; + + op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset); + tgt = lmv_tgt(lmv, op_data->op_mds); + if (!tgt) + RETURN(-ENODEV); + } else if (lmv_op_default_specific_mkdir(op_data)) { + op_data->op_mds = + op_data->op_default_mea1->lsm_md_master_mdt_index; + tgt = lmv_tgt(lmv, op_data->op_mds); + if (!tgt) + RETURN(-ENODEV); + } + + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); if (rc) RETURN(rc); - if (exp_connect_flags(exp) & OBD_CONNECT_DIR_STRIPE) { - /* Send the create request to the MDT where the object - * will be located */ - tgt = lmv_fid2tgt(lmv, &op_data->op_fid2); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - op_data->op_mds = tgt->ltd_index; - } - - CDEBUG(D_INODE, "CREATE obj "DFID" -> mds #%x\n", - PFID(&op_data->op_fid2), op_data->op_mds); + CDEBUG(D_INODE, "CREATE name '%.*s' "DFID" on "DFID" -> mds #%x\n", + (int)op_data->op_namelen, op_data->op_name, + PFID(&op_data->op_fid2), PFID(&op_data->op_fid1), + op_data->op_mds); op_data->op_flags |= MF_MDC_CANCEL_FID1; rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid, @@ -2107,10 +2089,20 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, if (IS_ERR(child_tgt)) RETURN(PTR_ERR(child_tgt)); - if (!S_ISDIR(op_data->op_mode) && tp_tgt) - rc = __lmv_fid_alloc(lmv, &target_fid, tp_tgt->ltd_index); - else - rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data); + /* for directory, migrate to MDT specified by lum_stripe_offset; + * otherwise migrate to the target stripe of parent, but parent + * directory may have finished migration (normally current file too), + * allocate FID on MDT lum_stripe_offset, and server will check + * whether file was migrated already. 
+ */ + if (S_ISDIR(op_data->op_mode) || !tp_tgt) { + struct lmv_user_md *lum = op_data->op_data; + + op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset); + } else { + op_data->op_mds = tp_tgt->ltd_index; + } + rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data); if (rc) RETURN(rc); @@ -3127,7 +3119,7 @@ static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, * set default value -1, so lmv_locate_tgt() knows this stripe * target is not initialized. */ - lsm->lsm_md_oinfo[i].lmo_mds = (u32)-1; + lsm->lsm_md_oinfo[i].lmo_mds = LMV_OFFSET_DEFAULT; if (!fid_is_sane(&lsm->lsm_md_oinfo[i].lmo_fid)) continue; diff --git a/lustre/lod/lod_internal.h b/lustre/lod/lod_internal.h index bf517f9..65ac3b9 100644 --- a/lustre/lod/lod_internal.h +++ b/lustre/lod/lod_internal.h @@ -578,6 +578,7 @@ void lod_fix_desc_qos_maxage(__u32 *val); void lod_fix_desc_pattern(__u32 *val); void lod_fix_desc_stripe_count(__u32 *val); void lod_fix_desc_stripe_size(__u64 *val); +void lod_fix_lmv_desc_pattern(__u32 *val); int lod_pools_init(struct lod_device *m, struct lustre_cfg *cfg); int lod_pools_fini(struct lod_device *m); int lod_parse_striping(const struct lu_env *env, struct lod_object *mo, @@ -598,14 +599,14 @@ int lod_alloc_comp_entries(struct lod_object *lo, int mirror_cnt, int comp_cnt); int lod_fill_mirrors(struct lod_object *lo); /* lod_pool.c */ -int lod_ost_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count); -int lod_ost_pool_remove(struct lu_tgt_pool *op, __u32 idx); -int lod_ost_pool_extend(struct lu_tgt_pool *op, unsigned int min_count); +int lod_tgt_pool_init(struct lu_tgt_pool *op, unsigned int count); +int lod_tgt_pool_free(struct lu_tgt_pool *op); +int lod_tgt_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count); +int lod_tgt_pool_remove(struct lu_tgt_pool *op, __u32 idx); +int lod_tgt_pool_extend(struct lu_tgt_pool *op, unsigned int min_count); struct pool_desc *lod_find_pool(struct lod_device *lod, char *poolname); void 
lod_pool_putref(struct pool_desc *pool); -int lod_ost_pool_free(struct lu_tgt_pool *op); int lod_pool_del(struct obd_device *obd, char *poolname); -int lod_ost_pool_init(struct lu_tgt_pool *op, unsigned int count); extern struct cfs_hash_ops pool_hash_operations; int lod_check_index_in_pool(__u32 idx, struct pool_desc *pool); int lod_pool_new(struct obd_device *obd, char *poolname); @@ -637,6 +638,10 @@ struct lod_obj_stripe_cb_data { }; /* lod_qos.c */ +int lod_mdt_alloc_qos(const struct lu_env *env, struct lod_object *lo, + struct dt_object **stripes); +int lod_mdt_alloc_rr(const struct lu_env *env, struct lod_object *lo, + struct dt_object **stripe); int lod_prepare_create(const struct lu_env *env, struct lod_object *lo, struct lu_attr *attr, const struct lu_buf *buf, struct thandle *th); @@ -652,7 +657,8 @@ __u16 lod_comp_entry_stripe_count(struct lod_object *lo, bool is_dir); __u16 lod_get_stripe_count(struct lod_device *lod, struct lod_object *lo, __u16 stripe_count, bool overstriping); -void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod); +void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod, + struct lu_tgt_descs *ltd); /* lproc_lod.c */ int lod_procfs_init(struct lod_device *lod); diff --git a/lustre/lod/lod_lov.c b/lustre/lod/lod_lov.c index 6590d3c..e449db6 100644 --- a/lustre/lod/lod_lov.c +++ b/lustre/lod/lod_lov.c @@ -89,10 +89,8 @@ void lod_putref(struct lod_device *lod, struct lod_tgt_descs *ltd) continue; list_add(&tgt_desc->ltd_kill, &kill); - /*FIXME: only support ost pool for now */ - if (ltd == &lod->lod_ost_descs) - lod_ost_pool_remove(&ltd->ltd_tgt_pool, - tgt_desc->ltd_index); + lod_tgt_pool_remove(&ltd->ltd_tgt_pool, + tgt_desc->ltd_index); ltd_del_tgt(ltd, tgt_desc); ltd->ltd_death_row--; } @@ -256,15 +254,12 @@ int lod_add_device(const struct lu_env *env, struct lod_device *lod, if (rc) GOTO(out_del_tgt, rc); - if (for_ost) { - /* pool is not supported for MDS stack yet */ - rc = 
lod_ost_pool_add(&ltd->ltd_tgt_pool, index, - ltd->ltd_tgts_size); - if (rc) { - CERROR("%s: can't set up pool, failed with %d\n", - obd->obd_name, rc); - GOTO(out_del_tgt, rc); - } + rc = lod_tgt_pool_add(&ltd->ltd_tgt_pool, index, + ltd->ltd_lov_desc.ld_tgt_count); + if (rc) { + CERROR("%s: can't set up pool, failed with %d\n", + obd->obd_name, rc); + GOTO(out_del_tgt, rc); } mutex_unlock(&ltd->ltd_mutex); @@ -301,7 +296,7 @@ out_ltd: thread = LTD_TGT(ltd, index)->ltd_recovery_thread; OBD_FREE_PTR(thread); } - lod_ost_pool_remove(&ltd->ltd_tgt_pool, index); + lod_tgt_pool_remove(&ltd->ltd_tgt_pool, index); out_del_tgt: ltd_del_tgt(ltd, tgt_desc); out_mutex: @@ -2006,6 +2001,14 @@ void lod_fix_desc_pattern(__u32 *val) } } +void lod_fix_lmv_desc_pattern(__u32 *val) +{ + if ((*val) && !lmv_is_known_hash_type(*val)) { + LCONSOLE_WARN("lod: Unknown md stripe pattern: %#x\n", *val); + *val = 0; + } +} + void lod_fix_desc_qos_maxage(__u32 *val) { /* fix qos_maxage */ @@ -2026,6 +2029,14 @@ void lod_fix_desc(struct lov_desc *desc) lod_fix_desc_qos_maxage(&desc->ld_qos_maxage); } +static void lod_fix_lmv_desc(struct lmv_desc *desc) +{ + desc->ld_active_tgt_count = 0; + lod_fix_desc_stripe_count(&desc->ld_default_stripe_count); + lod_fix_lmv_desc_pattern(&desc->ld_pattern); + lod_fix_desc_qos_maxage(&desc->ld_qos_maxage); +} + /** * Initialize the structures used to store pools and default striping. * @@ -2076,6 +2087,9 @@ int lod_pools_init(struct lod_device *lod, struct lustre_cfg *lcfg) desc->ld_active_tgt_count = 0; lod->lod_ost_descs.ltd_lov_desc = *desc; + /* NB: config doesn't contain lmv_desc, alter it via sysfs. 
*/ + lod_fix_lmv_desc(&lod->lod_mdt_descs.ltd_lmv_desc); + lod->lod_sp_me = LUSTRE_SP_CLI; /* Set up OST pool environment */ @@ -2091,17 +2105,30 @@ int lod_pools_init(struct lod_device *lod, struct lustre_cfg *lcfg) INIT_LIST_HEAD(&lod->lod_pool_list); lod->lod_pool_count = 0; - rc = lod_ost_pool_init(&lod->lod_ost_descs.ltd_tgt_pool, 0); + rc = lod_tgt_pool_init(&lod->lod_mdt_descs.ltd_tgt_pool, 0); if (rc) GOTO(out_hash, rc); - rc = lod_ost_pool_init(&lod->lod_ost_descs.ltd_qos.lq_rr.lqr_pool, 0); + + rc = lod_tgt_pool_init(&lod->lod_mdt_descs.ltd_qos.lq_rr.lqr_pool, 0); + if (rc) + GOTO(out_mdt_pool, rc); + + rc = lod_tgt_pool_init(&lod->lod_ost_descs.ltd_tgt_pool, 0); + if (rc) + GOTO(out_mdt_rr_pool, rc); + + rc = lod_tgt_pool_init(&lod->lod_ost_descs.ltd_qos.lq_rr.lqr_pool, 0); if (rc) - GOTO(out_pool_info, rc); + GOTO(out_ost_pool, rc); RETURN(0); -out_pool_info: - lod_ost_pool_free(&lod->lod_ost_descs.ltd_tgt_pool); +out_ost_pool: + lod_tgt_pool_free(&lod->lod_ost_descs.ltd_tgt_pool); +out_mdt_rr_pool: + lod_tgt_pool_free(&lod->lod_mdt_descs.ltd_qos.lq_rr.lqr_pool); +out_mdt_pool: + lod_tgt_pool_free(&lod->lod_mdt_descs.ltd_tgt_pool); out_hash: cfs_hash_putref(lod->lod_pools_hash_body); @@ -2131,8 +2158,10 @@ int lod_pools_fini(struct lod_device *lod) } cfs_hash_putref(lod->lod_pools_hash_body); - lod_ost_pool_free(&(lod->lod_ost_descs.ltd_qos.lq_rr.lqr_pool)); - lod_ost_pool_free(&lod->lod_ost_descs.ltd_tgt_pool); + lod_tgt_pool_free(&lod->lod_ost_descs.ltd_qos.lq_rr.lqr_pool); + lod_tgt_pool_free(&lod->lod_ost_descs.ltd_tgt_pool); + lod_tgt_pool_free(&lod->lod_mdt_descs.ltd_qos.lq_rr.lqr_pool); + lod_tgt_pool_free(&lod->lod_mdt_descs.ltd_tgt_pool); RETURN(0); } diff --git a/lustre/lod/lod_object.c b/lustre/lod/lod_object.c index c7c2a3a..f25a490 100644 --- a/lustre/lod/lod_object.c +++ b/lustre/lod/lod_object.c @@ -1945,74 +1945,73 @@ out: RETURN(rc); } -static int lod_prep_md_striped_create(const struct lu_env *env, - struct dt_object *dt, - struct 
lu_attr *attr, - const struct lmv_user_md_v1 *lum, - struct dt_object_format *dof, - struct thandle *th) +/** + * Allocate a striping on a predefined set of MDTs. + * + * Allocates new striping using the MDT index range provided by the data from + * the lum_objects contained in the lmv_user_md passed to this method if + * \a is_specific is true; or allocates new layout starting from MDT index in + * lo->ldo_dir_stripe_offset. The exact order of MDTs is not important and + * varies depending on MDT status. The number of stripes needed and stripe + * offset are taken from the object. If that number cannot be met, then the + * function returns an error and then it's the caller's responsibility to + * release the stripes allocated. All the internal structures are protected, + * but no concurrent allocation is allowed on the same objects. + * + * \param[in] env execution environment for this thread + * \param[in] lo LOD object + * \param[out] stripes striping created + * \param[out] mdt_indices MDT indices of striping created + * \param[in] is_specific true if the MDTs are provided by lum; false if + * only the starting MDT index is provided + * + * \retval positive stripes allocated, including the first stripe allocated + * outside + * \retval negative errno on failure + */ +static int lod_mdt_alloc_specific(const struct lu_env *env, + struct lod_object *lo, + struct dt_object **stripes, + __u32 *mdt_indices, bool is_specific) { struct lod_thread_info *info = lod_env_info(env); - struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev); - struct lod_tgt_descs *ltd = &lod->lod_mdt_descs; - struct lod_object *lo = lod_dt_obj(dt); - struct dt_object **stripe; - __u32 stripe_count; - int *idx_array; - __u32 master_index; - int rc = 0; - __u32 i; - __u32 j; - bool is_specific = false; - ENTRY; - - /* The lum has been verifed in lod_verify_md_striping */ - LASSERT(le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC || - le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC); - - 
stripe_count = lo->ldo_dir_stripe_count; - - OBD_ALLOC(idx_array, sizeof(idx_array[0]) * stripe_count); - if (idx_array == NULL) - RETURN(-ENOMEM); - - OBD_ALLOC(stripe, sizeof(stripe[0]) * stripe_count); - if (stripe == NULL) - GOTO(out_free, rc = -ENOMEM); + struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); + struct lu_tgt_descs *ltd = &lod->lod_mdt_descs; + struct lu_tgt_desc *tgt = NULL; + struct lu_object_conf conf = { .loc_flags = LOC_F_NEW }; + struct dt_device *tgt_dt = NULL; + struct lu_fid fid = { 0 }; + struct dt_object *dto; + u32 master_index; + u32 stripe_count = lo->ldo_dir_stripe_count; + int stripe_idx = 1; + int j; + int idx; + int rc; - /* Start index must be the master MDT */ master_index = lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id; - idx_array[0] = master_index; - if (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) { - is_specific = true; - for (i = 1; i < stripe_count; i++) - idx_array[i] = le32_to_cpu(lum->lum_objects[i].lum_mds); - } - - for (i = 0; i < stripe_count; i++) { - struct lod_tgt_desc *tgt = NULL; - struct dt_object *dto; - struct lu_fid fid = { 0 }; - int idx; - struct lu_object_conf conf = { 0 }; - struct dt_device *tgt_dt = NULL; + if (stripe_count > 1) + /* Set the start index for the 2nd stripe allocation */ + mdt_indices[1] = (mdt_indices[0] + 1) % + (lod->lod_remote_mdt_count + 1); + for (; stripe_idx < stripe_count; stripe_idx++) { /* Try to find next avaible target */ - idx = idx_array[i]; + idx = mdt_indices[stripe_idx]; for (j = 0; j < lod->lod_remote_mdt_count; j++, idx = (idx + 1) % (lod->lod_remote_mdt_count + 1)) { bool already_allocated = false; __u32 k; CDEBUG(D_INFO, "try idx %d, mdt cnt %u, allocated %u\n", - idx, lod->lod_remote_mdt_count + 1, i); + idx, lod->lod_remote_mdt_count + 1, stripe_idx); if (likely(!is_specific && !OBD_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE))) { /* check whether the idx already exists * in current allocated array */ - for (k = 0; k < i; k++) { - if (idx_array[k] 
== idx) { + for (k = 0; k < stripe_idx; k++) { + if (mdt_indices[k] == idx) { already_allocated = true; break; } @@ -2033,29 +2032,25 @@ static int lod_prep_md_striped_create(const struct lu_env *env, rc = obd_fid_alloc(env, lod->lod_child_exp, &fid, NULL); if (rc < 0) - GOTO(out_put, rc); + continue; tgt_dt = lod->lod_child; break; } /* check the status of the OSP */ tgt = LTD_TGT(ltd, idx); - if (tgt == NULL) + if (!tgt) continue; tgt_dt = tgt->ltd_tgt; rc = dt_statfs(env, tgt_dt, &info->lti_osfs); - if (rc) { + if (rc) /* this OSP doesn't feel well */ - rc = 0; continue; - } rc = obd_fid_alloc(env, tgt->ltd_exp, &fid, NULL); - if (rc < 0) { - rc = 0; + if (rc < 0) continue; - } break; } @@ -2063,15 +2058,16 @@ static int lod_prep_md_striped_create(const struct lu_env *env, /* Can not allocate more stripes */ if (j == lod->lod_remote_mdt_count) { CDEBUG(D_INFO, "%s: require stripes %u only get %d\n", - lod2obd(lod)->obd_name, stripe_count, i); + lod2obd(lod)->obd_name, stripe_count, + stripe_idx); break; } CDEBUG(D_INFO, "Get idx %d, for stripe %d "DFID"\n", - idx, i, PFID(&fid)); - idx_array[i] = idx; + idx, stripe_idx, PFID(&fid)); + mdt_indices[stripe_idx] = idx; /* Set the start index for next stripe allocation */ - if (!is_specific && i < stripe_count - 1) { + if (!is_specific && stripe_idx < stripe_count - 1) { /* * for large dir test, put all other slaves on one * remote MDT, otherwise we may save too many local @@ -2079,7 +2075,7 @@ static int lod_prep_md_striped_create(const struct lu_env *env, */ if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE))) idx = master_index; - idx_array[i + 1] = (idx + 1) % + mdt_indices[stripe_idx + 1] = (idx + 1) % (lod->lod_remote_mdt_count + 1); } /* tgt_dt and fid must be ready after search avaible OSP @@ -2088,47 +2084,124 @@ static int lod_prep_md_striped_create(const struct lu_env *env, LASSERT(fid_is_sane(&fid)); /* fail a remote stripe FID allocation */ - if (i && OBD_FAIL_CHECK(OBD_FAIL_MDS_STRIPE_FID)) + if 
(stripe_idx && OBD_FAIL_CHECK(OBD_FAIL_MDS_STRIPE_FID)) continue; - conf.loc_flags = LOC_F_NEW; dto = dt_locate_at(env, tgt_dt, &fid, - dt->do_lu.lo_dev->ld_site->ls_top_dev, - &conf); - if (IS_ERR(dto)) - GOTO(out_put, rc = PTR_ERR(dto)); - stripe[i] = dto; + lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev, + &conf); + if (IS_ERR(dto)) { + rc = PTR_ERR(dto); + goto error; + } + + stripes[stripe_idx] = dto; } + return stripe_idx; + +error: + for (j = 1; j < stripe_idx; j++) { + LASSERT(stripes[j] != NULL); + dt_object_put(env, stripes[j]); + stripes[j] = NULL; + } + return rc; +} + +static int lod_prep_md_striped_create(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + const struct lmv_user_md_v1 *lum, + struct dt_object_format *dof, + struct thandle *th) +{ + struct lod_device *lod = lu2lod_dev(dt->do_lu.lo_dev); + struct lod_object *lo = lod_dt_obj(dt); + struct dt_object **stripes; + struct lu_object_conf conf = { .loc_flags = LOC_F_NEW }; + struct lu_fid fid = { 0 }; + __u32 stripe_count; + int i; + int rc = 0; + + ENTRY; + + /* The lum has been verifed in lod_verify_md_striping */ + LASSERT(le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC || + le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC); + + stripe_count = lo->ldo_dir_stripe_count; + + OBD_ALLOC(stripes, sizeof(stripes[0]) * stripe_count); + if (!stripes) + RETURN(-ENOMEM); + + /* Allocate the first stripe locally */ + rc = obd_fid_alloc(env, lod->lod_child_exp, &fid, NULL); + if (rc < 0) + GOTO(out, rc); + + stripes[0] = dt_locate_at(env, lod->lod_child, &fid, + dt->do_lu.lo_dev->ld_site->ls_top_dev, &conf); + if (IS_ERR(stripes[0])) + GOTO(out, rc = PTR_ERR(stripes[0])); + + if (lo->ldo_dir_stripe_offset == LMV_OFFSET_DEFAULT) { + lod_qos_statfs_update(env, lod, &lod->lod_mdt_descs); + rc = lod_mdt_alloc_qos(env, lo, stripes); + if (rc == -EAGAIN) + rc = lod_mdt_alloc_rr(env, lo, stripes); + } else { + int *idx_array; + bool is_specific = false; + + OBD_ALLOC(idx_array, 
sizeof(idx_array[0]) * stripe_count); + if (!idx_array) + GOTO(out, rc = -ENOMEM); + + if (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) { + is_specific = true; + for (i = 0; i < stripe_count; i++) + idx_array[i] = + le32_to_cpu(lum->lum_objects[i].lum_mds); + } + + /* stripe 0 is local */ + idx_array[0] = + lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id; + rc = lod_mdt_alloc_specific(env, lo, stripes, idx_array, + is_specific); + OBD_FREE(idx_array, sizeof(idx_array[0]) * stripe_count); + } + + if (rc < 0) + GOTO(out, rc); + + LASSERT(rc > 0); + lo->ldo_dir_striped = 1; - lo->ldo_stripe = stripe; - lo->ldo_dir_stripe_count = i; + lo->ldo_stripe = stripes; + lo->ldo_dir_stripe_count = rc; lo->ldo_dir_stripes_allocated = stripe_count; smp_mb(); lo->ldo_dir_stripe_loaded = 1; - if (lo->ldo_dir_stripe_count == 0) - GOTO(out_put, rc = -ENOSPC); - rc = lod_dir_declare_create_stripes(env, dt, attr, dof, th); - if (rc != 0) - GOTO(out_put, rc); + if (rc < 0) + lod_striping_free(env, lo); -out_put: - if (rc < 0) { - for (i = 0; i < stripe_count; i++) - if (stripe[i] != NULL) - dt_object_put(env, stripe[i]); - OBD_FREE(stripe, sizeof(stripe[0]) * stripe_count); - lo->ldo_dir_stripe_count = 0; - lo->ldo_dir_stripes_allocated = 0; - lo->ldo_stripe = NULL; - } + RETURN(rc); -out_free: - OBD_FREE(idx_array, sizeof(idx_array[0]) * stripe_count); +out: + LASSERT(rc < 0); + if (!IS_ERR_OR_NULL(stripes[0])) + dt_object_put(env, stripes[0]); + for (i = 1; i < stripe_count; i++) + LASSERT(!stripes[i]); + OBD_FREE(stripes, sizeof(stripes[0]) * stripe_count); - RETURN(rc); + return rc; } /** @@ -3770,8 +3843,7 @@ static int lod_xattr_set_default_lmv_on_dir(const struct lu_env *env, if (LMVEA_DELETE_VALUES((le32_to_cpu(lum->lum_stripe_count)), le32_to_cpu(lum->lum_stripe_offset)) && - le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC && - !(le32_to_cpu(lum->lum_hash_type) & LMV_HASH_FLAG_SPACE)) { + le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC) { rc = 
lod_xattr_del_internal(env, dt, name, th); if (rc == -ENODATA) rc = 0; @@ -5136,8 +5208,7 @@ static void lod_striping_from_default(struct lod_object *lo, lo->ldo_dir_stripe_offset = lds->lds_dir_def_stripe_offset; if (lo->ldo_dir_hash_type == 0) - lo->ldo_dir_hash_type = lds->lds_dir_def_hash_type & - ~LMV_HASH_FLAG_SPACE; + lo->ldo_dir_hash_type = lds->lds_dir_def_hash_type; CDEBUG(D_LAYOUT, "striping from default dir: count:%hu, " "offset:%u, hash_type:%u\n", @@ -5593,7 +5664,7 @@ static inline int dt_object_qos_mkdir(const struct lu_env *env, return -EINVAL; lmu = info->lti_ea_store; - return !!(le32_to_cpu(lmu->lum_hash_type) & LMV_HASH_FLAG_SPACE); + return le32_to_cpu(lmu->lum_stripe_offset) == LMV_OFFSET_DEFAULT; } /** @@ -5659,19 +5730,22 @@ static int lod_declare_create(const struct lu_env *env, struct dt_object *dt, if (OBD_FAIL_CHECK(OBD_FAIL_MDS_STALE_DIR_LAYOUT)) GOTO(out, rc = -EREMOTE); - if (lo->ldo_dir_stripe_offset == -1) { + if (lo->ldo_dir_stripe_offset == LMV_OFFSET_DEFAULT) { + struct lod_default_striping *lds; + + lds = lo->ldo_def_striping; /* - * child and parent should be in the same MDT, - * but if parent has plain layout, it's allowed. + * child and parent should be on the same MDT, + * but if parent has default LMV, and the start + * MDT offset is -1, it's allowed. This check + * is not necessary after 2.12.22 because client + * follows this already, but old client may not. */ if (hint->dah_parent && - dt_object_remote(hint->dah_parent)) { - rc = dt_object_qos_mkdir(env, - lo->ldo_obj.do_lu.lo_dev, - hint->dah_parent); - if (rc <= 0) - GOTO(out, rc ? 
rc : -EREMOTE); - } + dt_object_remote(hint->dah_parent) && lds && + lds->lds_dir_def_stripe_offset != + LMV_OFFSET_DEFAULT) + GOTO(out, rc = -EREMOTE); } else if (lo->ldo_dir_stripe_offset != ss->ss_node_id) { struct lod_device *lod; @@ -7178,7 +7252,7 @@ static int lod_primary_pick(const struct lu_env *env, struct lod_object *lo, * This algo can be revised later after knowing the topology of * cluster. */ - lod_qos_statfs_update(env, lod); + lod_qos_statfs_update(env, lod, &lod->lod_ost_descs); for (i = 0; i < lo->ldo_mirror_count; i++) { bool ost_avail = true; int index = (i + seq) % lo->ldo_mirror_count; diff --git a/lustre/lod/lod_pool.c b/lustre/lod/lod_pool.c index eb23de4..51cff07 100644 --- a/lustre/lod/lod_pool.c +++ b/lustre/lod/lod_pool.c @@ -99,8 +99,8 @@ void lod_pool_putref(struct pool_desc *pool) LASSERT(hlist_unhashed(&pool->pool_hash)); LASSERT(list_empty(&pool->pool_list)); LASSERT(pool->pool_proc_entry == NULL); - lod_ost_pool_free(&(pool->pool_rr.lqr_pool)); - lod_ost_pool_free(&(pool->pool_obds)); + lod_tgt_pool_free(&(pool->pool_rr.lqr_pool)); + lod_tgt_pool_free(&(pool->pool_obds)); OBD_FREE_PTR(pool); EXIT; } @@ -464,7 +464,7 @@ void lod_dump_pool(int level, struct pool_desc *pool) * \retval negative error number on failure */ #define POOL_INIT_COUNT 2 -int lod_ost_pool_init(struct lu_tgt_pool *op, unsigned int count) +int lod_tgt_pool_init(struct lu_tgt_pool *op, unsigned int count) { ENTRY; @@ -496,7 +496,7 @@ int lod_ost_pool_init(struct lu_tgt_pool *op, unsigned int count) * \retval 0 on success * \retval negative error number on failure. 
*/ -int lod_ost_pool_extend(struct lu_tgt_pool *op, unsigned int min_count) +int lod_tgt_pool_extend(struct lu_tgt_pool *op, unsigned int min_count) { __u32 *new; __u32 new_size; @@ -534,7 +534,7 @@ int lod_ost_pool_extend(struct lu_tgt_pool *op, unsigned int min_count) * \retval 0 if target could be added to the pool * \retval negative error if target \a idx was not added */ -int lod_ost_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count) +int lod_tgt_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count) { unsigned int i; int rc = 0; @@ -542,7 +542,7 @@ int lod_ost_pool_add(struct lu_tgt_pool *op, __u32 idx, unsigned int min_count) down_write(&op->op_rw_sem); - rc = lod_ost_pool_extend(op, min_count); + rc = lod_tgt_pool_extend(op, min_count); if (rc) GOTO(out, rc); @@ -574,7 +574,7 @@ out: * \retval 0 on success * \retval negative error number on failure */ -int lod_ost_pool_remove(struct lu_tgt_pool *op, __u32 idx) +int lod_tgt_pool_remove(struct lu_tgt_pool *op, __u32 idx) { unsigned int i; ENTRY; @@ -608,7 +608,7 @@ int lod_ost_pool_remove(struct lu_tgt_pool *op, __u32 idx) * * \retval 0 on success or if pool was already freed */ -int lod_ost_pool_free(struct lu_tgt_pool *op) +int lod_tgt_pool_free(struct lu_tgt_pool *op) { ENTRY; @@ -657,13 +657,13 @@ int lod_pool_new(struct obd_device *obd, char *poolname) strlcpy(new_pool->pool_name, poolname, sizeof(new_pool->pool_name)); new_pool->pool_lobd = obd; atomic_set(&new_pool->pool_refcount, 1); - rc = lod_ost_pool_init(&new_pool->pool_obds, 0); + rc = lod_tgt_pool_init(&new_pool->pool_obds, 0); if (rc) GOTO(out_err, rc); lu_qos_rr_init(&new_pool->pool_rr); - rc = lod_ost_pool_init(&new_pool->pool_rr.lqr_pool, 0); + rc = lod_tgt_pool_init(&new_pool->pool_rr.lqr_pool, 0); if (rc) GOTO(out_free_pool_obds, rc); @@ -708,9 +708,9 @@ out_err: lprocfs_remove(&new_pool->pool_proc_entry); - lod_ost_pool_free(&new_pool->pool_rr.lqr_pool); + lod_tgt_pool_free(&new_pool->pool_rr.lqr_pool); 
out_free_pool_obds: - lod_ost_pool_free(&new_pool->pool_obds); + lod_tgt_pool_free(&new_pool->pool_obds); OBD_FREE_PTR(new_pool); return rc; } @@ -791,8 +791,8 @@ int lod_pool_add(struct obd_device *obd, char *poolname, char *ostname) if (rc) GOTO(out, rc); - rc = lod_ost_pool_add(&pool->pool_obds, tgt->ltd_index, - lod->lod_ost_descs.ltd_tgts_size); + rc = lod_tgt_pool_add(&pool->pool_obds, tgt->ltd_index, + lod->lod_ost_count); if (rc) GOTO(out, rc); @@ -849,8 +849,7 @@ int lod_pool_remove(struct obd_device *obd, char *poolname, char *ostname) if (rc) GOTO(out, rc); - lod_ost_pool_remove(&pool->pool_obds, ost->ltd_index); - + lod_tgt_pool_remove(&pool->pool_obds, ost->ltd_index); pool->pool_rr.lqr_dirty = 1; CDEBUG(D_CONFIG, "%s removed from "LOV_POOLNAMEF"\n", ostname, diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c index 1892dee..149b06f 100644 --- a/lustre/lod/lod_qos.c +++ b/lustre/lod/lod_qos.c @@ -60,11 +60,32 @@ #define TGT_BAVAIL(i) (OST_TGT(lod,i)->ltd_statfs.os_bavail * \ OST_TGT(lod,i)->ltd_statfs.os_bsize) +static inline int lod_statfs_check(struct lu_tgt_descs *ltd, + struct lu_tgt_desc *tgt) +{ + struct obd_statfs *sfs = &tgt->ltd_statfs; + + if (((sfs->os_state & OS_STATE_ENOSPC) || + (!ltd->ltd_is_mdt && sfs->os_state & OS_STATE_ENOINO && + sfs->os_fprecreated == 0))) + return -ENOSPC; + + /* If the OST is readonly then we can't allocate objects there */ + if (sfs->os_state & OS_STATE_READONLY) + return -EROFS; + + /* object precreation is skipped on the OST with max_create_count=0 */ + if (!ltd->ltd_is_mdt && sfs->os_state & OS_STATE_NOPRECREATE) + return -ENOBUFS; + + return 0; +} + /** - * Check whether the target is available for new OST objects. + * Check whether the target is available for new objects. * * Request statfs data from the given target and verify it's active and not - * read-only. If so, then it can be used to place new OST objects. This + * read-only. If so, then it can be used to place new objects. 
This * function also maintains the number of active/inactive targets and sets * dirty flags if those numbers change so others can run re-balance procedures. * No external locking is required. @@ -72,42 +93,30 @@ * \param[in] env execution environment for this thread * \param[in] d LOD device * \param[in] ltd target table - * \param[in] index target index - * \param[out] sfs buffer for statfs data + * \param[in] tgt target * * \retval 0 if the target is good * \retval negative negated errno on error - */ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d, - struct lu_tgt_descs *ltd, int index, - struct obd_statfs *sfs) + struct lu_tgt_descs *ltd, + struct lu_tgt_desc *tgt) { struct lov_desc *desc = &ltd->ltd_lov_desc; - struct lu_tgt_desc *tgt = LTD_TGT(ltd, index); int rc; - ENTRY; - LASSERT(d); LASSERT(tgt); - rc = dt_statfs(env, tgt->ltd_tgt, sfs); - - if (rc == 0 && ((sfs->os_state & OS_STATE_ENOSPC) || - (sfs->os_state & OS_STATE_ENOINO && sfs->os_fprecreated == 0))) - RETURN(-ENOSPC); - + rc = dt_statfs(env, tgt->ltd_tgt, &tgt->ltd_statfs); if (rc && rc != -ENOTCONN) CERROR("%s: statfs: rc = %d\n", lod2obd(d)->obd_name, rc); - /* If the OST is readonly then we can't allocate objects there */ - if (sfs->os_state & OS_STATE_READONLY) - rc = -EROFS; - - /* object precreation is skipped on the OST with max_create_count=0 */ - if (sfs->os_state & OS_STATE_NOPRECREATE) - rc = -ENOBUFS; + if (!rc) { + rc = lod_statfs_check(ltd, tgt); + if (rc == -ENOSPC) + return rc; + } /* check whether device has changed state (active, inactive) */ if (rc != 0 && tgt->ltd_active) { @@ -144,7 +153,21 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d, spin_unlock(&d->lod_lock); } - RETURN(rc); + return rc; +} + +static int lod_is_tgt_usable(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt) +{ + int rc; + + rc = lod_statfs_check(ltd, tgt); + if (rc) + return rc; + + if (!tgt->ltd_active) + return -ENOTCONN; + + return 0; } /**
@@ -156,43 +179,41 @@ static int lod_statfs_and_check(const struct lu_env *env, struct lod_device *d, * * \param[in] env execution environment for this thread * \param[in] lod LOD device + * \param[in] ltd tgt table */ -void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod) +void lod_qos_statfs_update(const struct lu_env *env, struct lod_device *lod, + struct lu_tgt_descs *ltd) { struct obd_device *obd = lod2obd(lod); - struct lu_tgt_pool *osts = &lod->lod_ost_descs.ltd_tgt_pool; + struct lu_tgt_desc *tgt; time64_t max_age; - unsigned int i; u64 avail; - int idx; ENTRY; - max_age = ktime_get_seconds() - - 2 * lod->lod_ost_descs.ltd_lov_desc.ld_qos_maxage; + max_age = ktime_get_seconds() - 2 * ltd->ltd_lov_desc.ld_qos_maxage; if (obd->obd_osfs_age > max_age) /* statfs data are quite recent, don't need to refresh it */ RETURN_EXIT; - down_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); + down_write(&ltd->ltd_qos.lq_rw_sem); if (obd->obd_osfs_age > max_age) goto out; - for (i = 0; i < osts->op_count; i++) { - idx = osts->op_array[i]; - avail = OST_TGT(lod,idx)->ltd_statfs.os_bavail; - if (lod_statfs_and_check(env, lod, &lod->lod_ost_descs, idx, - &OST_TGT(lod, idx)->ltd_statfs)) + ltd_foreach_tgt(ltd, tgt) { + avail = tgt->ltd_statfs.os_bavail; + if (lod_statfs_and_check(env, lod, ltd, tgt)) continue; - if (OST_TGT(lod,idx)->ltd_statfs.os_bavail != avail) + + if (tgt->ltd_statfs.os_bavail != avail) /* recalculate weigths */ - lod->lod_ost_descs.ltd_qos.lq_dirty = 1; + ltd->ltd_qos.lq_dirty = 1; } obd->obd_osfs_age = ktime_get_seconds(); out: - up_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); + up_write(&ltd->ltd_qos.lq_rw_sem); EXIT; } @@ -208,17 +229,19 @@ out: * a new target or activation/deactivation).
* * \param[in] lod LOD device - * \param[in] src_pool OST pool + * \param[in] ltd tgt table + * \param[in] src_pool tgt pool * \param[in] lqr round-robin list * * \retval 0 on success * \retval -ENOMEM fails to allocate the array */ -static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_pool *src_pool, +static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_descs *ltd, + const struct lu_tgt_pool *src_pool, struct lu_qos_rr *lqr) { - struct lu_svr_qos *oss; - struct lod_tgt_desc *ost; + struct lu_svr_qos *svr; + struct lu_tgt_desc *tgt; unsigned placed, real_count; unsigned int i; int rc; @@ -230,7 +253,7 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_pool *src_pool, } /* Do actual allocation. */ - down_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); + down_write(&ltd->ltd_qos.lq_rw_sem); /* * Check again. While we were sleeping on @lq_rw_sem something could @@ -238,7 +261,7 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_pool *src_pool, */ if (!lqr->lqr_dirty) { LASSERT(lqr->lqr_pool.op_size); - up_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); + up_write(&ltd->ltd_qos.lq_rw_sem); RETURN(0); } @@ -249,34 +272,33 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_pool *src_pool, deleting from the pool. The lq_rw_sem insures that nobody else is reading. */ lqr->lqr_pool.op_count = real_count; - rc = lod_ost_pool_extend(&lqr->lqr_pool, real_count); + rc = lod_tgt_pool_extend(&lqr->lqr_pool, real_count); if (rc) { - up_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); + up_write(&ltd->ltd_qos.lq_rw_sem); RETURN(rc); } for (i = 0; i < lqr->lqr_pool.op_count; i++) lqr->lqr_pool.op_array[i] = LOV_QOS_EMPTY; - /* Place all the OSTs from 1 OSS at the same time. */ + /* Place all the tgts from 1 svr at the same time.
*/ placed = 0; - list_for_each_entry(oss, &lod->lod_ost_descs.ltd_qos.lq_svr_list, - lsq_svr_list) { + list_for_each_entry(svr, &ltd->ltd_qos.lq_svr_list, lsq_svr_list) { int j = 0; for (i = 0; i < lqr->lqr_pool.op_count; i++) { int next; - if (!cfs_bitmap_check(lod->lod_ost_bitmap, - src_pool->op_array[i])) + if (!cfs_bitmap_check(ltd->ltd_tgt_bitmap, + src_pool->op_array[i])) continue; - ost = OST_TGT(lod,src_pool->op_array[i]); - LASSERT(ost && ost->ltd_tgt); - if (ost->ltd_qos.ltq_svr != oss) + tgt = LTD_TGT(ltd, src_pool->op_array[i]); + LASSERT(tgt && tgt->ltd_tgt); + if (tgt->ltd_qos.ltq_svr != svr) continue; - /* Evenly space these OSTs across arrayspace */ - next = j * lqr->lqr_pool.op_count / oss->lsq_tgt_count; + /* Evenly space these tgts across arrayspace */ + next = j * lqr->lqr_pool.op_count / svr->lsq_tgt_count; while (lqr->lqr_pool.op_array[next] != LOV_QOS_EMPTY) next = (next + 1) % lqr->lqr_pool.op_count; @@ -287,15 +309,15 @@ static int lod_qos_calc_rr(struct lod_device *lod, struct lu_tgt_pool *src_pool, } lqr->lqr_dirty = 0; - up_write(&lod->lod_ost_descs.ltd_qos.lq_rw_sem); + up_write(&ltd->ltd_qos.lq_rw_sem); if (placed != real_count) { /* This should never happen */ - LCONSOLE_ERROR_MSG(0x14e, "Failed to place all OSTs in the " + LCONSOLE_ERROR_MSG(0x14e, "Failed to place all tgts in the " "round-robin list (%d of %d).\n", placed, real_count); for (i = 0; i < lqr->lqr_pool.op_count; i++) { - LCONSOLE(D_WARNING, "rr #%d ost idx=%d\n", i, + LCONSOLE(D_WARNING, "rr #%d tgt idx=%d\n", i, lqr->lqr_pool.op_array[i]); } lqr->lqr_dirty = 1; @@ -401,7 +423,7 @@ static int min_stripe_count(__u32 stripe_count, int flags) #define LOV_CREATE_RESEED_MIN 2000 /** - * Initialize temporary OST-in-use array. + * Initialize temporary tgt-in-use array. * * Allocate or extend the array used to mark targets already assigned to a new * striping so they are not used more than once.
@@ -412,7 +434,7 @@ static int min_stripe_count(__u32 stripe_count, int flags) * \retval 0 on success * \retval -ENOMEM on error */ -static inline int lod_qos_ost_in_use_clear(const struct lu_env *env, +static inline int lod_qos_tgt_in_use_clear(const struct lu_env *env, __u32 stripes) { struct lod_thread_info *info = lod_env_info(env); @@ -431,43 +453,44 @@ static inline int lod_qos_ost_in_use_clear(const struct lu_env *env, * Remember a target in the array of used targets. * * Mark the given target as used for a new striping being created. The status - * of an OST in a striping can be checked with lod_qos_is_ost_used(). + * of an tgt in a striping can be checked with lod_qos_is_tgt_used(). * * \param[in] env execution environment for this thread * \param[in] idx index in the array - * \param[in] ost OST target index to mark as used + * \param[in] tgt_idx target index to mark as used */ -static inline void lod_qos_ost_in_use(const struct lu_env *env, - int idx, int ost) +static inline void lod_qos_tgt_in_use(const struct lu_env *env, + int idx, int tgt_idx) { struct lod_thread_info *info = lod_env_info(env); - int *osts = info->lti_ea_store; + int *tgts = info->lti_ea_store; LASSERT(info->lti_ea_store_size >= idx * sizeof(int)); - osts[idx] = ost; + tgts[idx] = tgt_idx; } /** - * Check is OST used in a striping. + * Check is tgt used in a striping. * - * Checks whether OST with the given index is marked as used in the temporary - * array (see lod_qos_ost_in_use()). + * Checks whether tgt with the given index is marked as used in the temporary + * array (see lod_qos_tgt_in_use()). 
* * \param[in] env execution environment for this thread - * \param[in] ost OST target index to check + * \param[in] tgt_idx target index to check * \param[in] stripes the number of items used in the array already * * \retval 0 not used * \retval 1 used */ -static int lod_qos_is_ost_used(const struct lu_env *env, int ost, __u32 stripes) +static int lod_qos_is_tgt_used(const struct lu_env *env, int tgt_idx, + __u32 stripes) { struct lod_thread_info *info = lod_env_info(env); - int *osts = info->lti_ea_store; + int *tgts = info->lti_ea_store; __u32 j; for (j = 0; j < stripes; j++) { - if (osts[j] == ost) + if (tgts[j] == tgt_idx) return 1; } return 0; @@ -580,8 +603,7 @@ static inline bool lod_should_avoid_ost(struct lod_object *lo, static int lod_check_and_reserve_ost(const struct lu_env *env, struct lod_object *lo, struct lod_layout_component *lod_comp, - struct obd_statfs *sfs, __u32 ost_idx, - __u32 speed, __u32 *s_idx, + __u32 ost_idx, __u32 speed, __u32 *s_idx, struct dt_object **stripe, __u32 *ost_indices, struct thandle *th, @@ -589,12 +611,14 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, { struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid; + struct lu_tgt_desc *ost = OST_TGT(lod, ost_idx); struct dt_object *o; __u32 stripe_idx = *s_idx; int rc; + ENTRY; - rc = lod_statfs_and_check(env, lod, &lod->lod_ost_descs, ost_idx, sfs); + rc = lod_statfs_and_check(env, lod, &lod->lod_ost_descs, ost); if (rc) RETURN(rc); @@ -602,7 +626,7 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, * We expect number of precreated objects in f_ffree at * the first iteration, skip OSPs with no objects ready */ - if (sfs->os_fprecreated == 0 && speed == 0) { + if (ost->ltd_statfs.os_fprecreated == 0 && speed == 0) { QOS_DEBUG("#%d: precreation is empty\n", ost_idx); RETURN(rc); } @@ -610,7 +634,7 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, /* * try to use 
another OSP if this one is degraded */ - if (sfs->os_state & OS_STATE_DEGRADED && speed < 2) { + if (ost->ltd_statfs.os_state & OS_STATE_DEGRADED && speed < 2) { QOS_DEBUG("#%d: degraded\n", ost_idx); RETURN(rc); } @@ -630,13 +654,13 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, * for the first and second time. */ if (speed < 2 && lod_should_avoid_ost(lo, lag, ost_idx)) { - QOS_DEBUG("iter %d: OST%d used by conflicting mirror " - "component\n", speed, ost_idx); + QOS_DEBUG("iter %d: OST%d used by conflicting mirror component\n", + speed, ost_idx); RETURN(rc); } /* do not put >1 objects on a single OST, except for overstriping */ - if (lod_qos_is_ost_used(env, ost_idx, stripe_idx)) { + if (lod_qos_is_tgt_used(env, ost_idx, stripe_idx)) { if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING) *overstriped = true; else @@ -655,7 +679,7 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, * We've successfully declared (reserved) an object */ lod_avoid_update(lo, lag); - lod_qos_ost_in_use(env, stripe_idx, ost_idx); + lod_qos_tgt_in_use(env, stripe_idx, ost_idx); stripe[stripe_idx] = o; ost_indices[stripe_idx] = ost_idx; OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_LOV_CREATE_RACE, 2); @@ -691,13 +715,12 @@ static int lod_check_and_reserve_ost(const struct lu_env *env, * \retval -ENOSPC if not enough OSTs are found * \retval negative negated errno for other failures */ -static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo, - struct dt_object **stripe, __u32 *ost_indices, - int flags, struct thandle *th, int comp_idx) +static int lod_ost_alloc_rr(const struct lu_env *env, struct lod_object *lo, + struct dt_object **stripe, __u32 *ost_indices, + int flags, struct thandle *th, int comp_idx) { struct lod_layout_component *lod_comp; struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); - struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs; struct pool_desc *pool = NULL; struct lu_tgt_pool *osts; struct lu_qos_rr *lqr; @@ -727,11 
+750,11 @@ static int lod_alloc_rr(const struct lu_env *env, struct lod_object *lo, lqr = &(m->lod_ost_descs.ltd_qos.lq_rr); } - rc = lod_qos_calc_rr(m, osts, lqr); + rc = lod_qos_calc_rr(m, &m->lod_ost_descs, osts, lqr); if (rc) GOTO(out, rc); - rc = lod_qos_ost_in_use_clear(env, stripe_count); + rc = lod_qos_tgt_in_use_clear(env, stripe_count); if (rc) GOTO(out, rc); @@ -786,7 +809,7 @@ repeat_find: continue; spin_unlock(&lqr->lqr_alloc); - rc = lod_check_and_reserve_ost(env, lo, lod_comp, sfs, ost_idx, + rc = lod_check_and_reserve_ost(env, lo, lod_comp, ost_idx, speed, &stripe_idx, stripe, ost_indices, th, &overstriped); spin_lock(&lqr->lqr_alloc); @@ -835,6 +858,165 @@ out: } /** + * Allocate a striping using round-robin algorithm. + * + * Allocates a new striping using round-robin algorithm. The function refreshes + * all the internal structures (statfs cache, array of available remote MDTs + * sorted with regard to MDS, etc). The number of stripes required is taken from + * the object (must be prepared by the caller). The caller should ensure nobody + * else is trying to create a striping on the object in parallel. All the + * internal structures (like pools, etc) are protected and no additional locking + * is required. The function succeeds even if a single stripe is allocated. 
+ * + * \param[in] env execution environment for this thread + * \param[in] lo LOD object + * \param[out] stripe striping created + * + * \retval positive stripe objects allocated, including the first stripe + * allocated outside + * \retval -ENOSPC if not enough MDTs are found + * \retval negative negated errno for other failures + */ +int lod_mdt_alloc_rr(const struct lu_env *env, struct lod_object *lo, + struct dt_object **stripe) +{ + struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); + struct lu_tgt_descs *ltd = &lod->lod_mdt_descs; + struct lu_tgt_pool *pool; + struct lu_qos_rr *lqr; + struct lu_tgt_desc *mdt; + struct lu_object_conf conf = { .loc_flags = LOC_F_NEW }; + struct lu_fid fid = { 0 }; + struct dt_object *dto; + unsigned int pool_idx; + unsigned int i; + u32 start_idx_temp; + u32 stripe_count = lo->ldo_dir_stripe_count; + u32 stripe_idx = 1; + u32 mdt_idx; + bool use_degraded = false; + int tgt_connecting = 0; + int rc; + + ENTRY; + + pool = &ltd->ltd_tgt_pool; + lqr = &ltd->ltd_qos.lq_rr; + rc = lod_qos_calc_rr(lod, ltd, pool, lqr); + if (rc) + RETURN(rc); + + rc = lod_qos_tgt_in_use_clear(env, stripe_count); + if (rc) + RETURN(rc); + + down_read(&ltd->ltd_qos.lq_rw_sem); + spin_lock(&lqr->lqr_alloc); + if (--lqr->lqr_start_count <= 0) { + lqr->lqr_start_idx = prandom_u32_max(pool->op_count); + lqr->lqr_start_count = + (LOV_CREATE_RESEED_MIN / max(pool->op_count, 1U) + + LOV_CREATE_RESEED_MULT) * max(pool->op_count, 1U); + } else if (stripe_count - 1 >= pool->op_count || + lqr->lqr_start_idx > pool->op_count) { + /* If we have allocated from all of the tgts, slowly + * precess the next start if the tgt/stripe count isn't + * already doing this for us.
*/ + lqr->lqr_start_idx %= pool->op_count; + if (stripe_count - 1 > 1 && + (pool->op_count % (stripe_count - 1)) != 1) + ++lqr->lqr_offset_idx; + } + start_idx_temp = lqr->lqr_start_idx; + +repeat_find: + QOS_DEBUG("want %d start_idx %d start_count %d offset %d active %d count %d\n", + stripe_count - 1, lqr->lqr_start_idx, lqr->lqr_start_count, + lqr->lqr_offset_idx, pool->op_count, pool->op_count); + + for (i = 0; i < pool->op_count && stripe_idx < stripe_count; i++) { + pool_idx = (lqr->lqr_start_idx + lqr->lqr_offset_idx) % + pool->op_count; + ++lqr->lqr_start_idx; + mdt_idx = lqr->lqr_pool.op_array[pool_idx]; + mdt = LTD_TGT(ltd, mdt_idx); + + QOS_DEBUG("#%d strt %d act %d strp %d ary %d idx %d\n", + i, lqr->lqr_start_idx, /* XXX: active*/ 0, + stripe_idx, pool_idx, mdt_idx); + + if (mdt_idx == LOV_QOS_EMPTY || + !cfs_bitmap_check(ltd->ltd_tgt_bitmap, mdt_idx)) + continue; + + /* do not put >1 objects on one MDT */ + if (lod_qos_is_tgt_used(env, mdt_idx, stripe_idx)) + continue; + + rc = lod_is_tgt_usable(ltd, mdt); + if (rc) { + if (mdt->ltd_connecting) + tgt_connecting = 1; + continue; + } + + /* try to use another OSP if this one is degraded */ + if (mdt->ltd_statfs.os_state & OS_STATE_DEGRADED && + !use_degraded) { + QOS_DEBUG("#%d: degraded\n", mdt_idx); + continue; + } + spin_unlock(&lqr->lqr_alloc); + + rc = obd_fid_alloc(env, mdt->ltd_exp, &fid, NULL); + if (rc) { + QOS_DEBUG("#%d: alloc FID failed: %dl\n", mdt_idx, rc); + spin_lock(&lqr->lqr_alloc); + continue; + } + + dto = dt_locate_at(env, mdt->ltd_tgt, &fid, + lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev, + &conf); + + spin_lock(&lqr->lqr_alloc); + if (IS_ERR(dto)) { + QOS_DEBUG("can't alloc stripe on #%u: %d\n", + mdt->ltd_index, (int) PTR_ERR(dto)); + + if (mdt->ltd_connecting) + tgt_connecting = 1; + continue; + } + + lod_qos_tgt_in_use(env, stripe_idx, mdt_idx); + stripe[stripe_idx] = dto; + stripe_idx++; + } + + if (!use_degraded && stripe_idx < stripe_count) { + /* Try again, allowing slower 
OSCs */ + use_degraded = true; + lqr->lqr_start_idx = start_idx_temp; + + tgt_connecting = 0; + goto repeat_find; + } + spin_unlock(&lqr->lqr_alloc); + up_read(&ltd->ltd_qos.lq_rw_sem); + + if (stripe_idx > 1) + /* at least one stripe is allocated */ + RETURN(stripe_idx); + + /* nobody provided us with a single object */ + if (tgt_connecting) + RETURN(-EINPROGRESS); + + RETURN(-ENOSPC); +} + +/** * Allocate a specific striping layout on a user defined set of OSTs. * * Allocates new striping using the OST index range provided by the data from @@ -865,7 +1047,6 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo, { struct lod_layout_component *lod_comp; struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); - struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs; struct dt_object *o; unsigned int array_idx = 0; int stripe_count = 0; @@ -879,7 +1060,7 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo, LASSERT(lod_comp->llc_ostlist.op_array); LASSERT(lod_comp->llc_ostlist.op_count); - rc = lod_qos_ost_in_use_clear(env, lod_comp->llc_stripe_count); + rc = lod_qos_tgt_in_use_clear(env, lod_comp->llc_stripe_count); if (rc < 0) RETURN(rc); @@ -913,14 +1094,14 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo, /* do not put >1 objects on a single OST, except for * overstriping */ - if (lod_qos_is_ost_used(env, ost_idx, stripe_count) && + if (lod_qos_is_tgt_used(env, ost_idx, stripe_count) && !(lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)) { rc = -EINVAL; break; } - rc = lod_statfs_and_check(env, m, &m->lod_ost_descs, ost_idx, - sfs); + rc = lod_statfs_and_check(env, m, &m->lod_ost_descs, + LTD_TGT(&m->lod_ost_descs, ost_idx)); if (rc < 0) /* this OSP doesn't feel well */ break; @@ -936,7 +1117,7 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo, /* * We've successfully declared (reserved) an object */ - lod_qos_ost_in_use(env, stripe_count,
ost_idx); + lod_qos_tgt_in_use(env, stripe_count, ost_idx); stripe[stripe_count] = o; ost_indices[stripe_count] = ost_idx; stripe_count++; @@ -971,14 +1152,15 @@ static int lod_alloc_ost_list(const struct lu_env *env, struct lod_object *lo, * \retval -EINVAL requested offset is invalid * \retval negative errno on failure */ -static int lod_alloc_specific(const struct lu_env *env, struct lod_object *lo, - struct dt_object **stripe, __u32 *ost_indices, - int flags, struct thandle *th, int comp_idx) +static int lod_ost_alloc_specific(const struct lu_env *env, + struct lod_object *lo, + struct dt_object **stripe, __u32 *ost_indices, + int flags, struct thandle *th, int comp_idx) { struct lod_layout_component *lod_comp; struct lod_device *m = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); - struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs; struct dt_object *o; + struct lu_tgt_desc *tgt; __u32 ost_idx; unsigned int i, array_idx, ost_count; int rc, stripe_num = 0; @@ -992,7 +1174,7 @@ static int lod_alloc_specific(const struct lu_env *env, struct lod_object *lo, LASSERT(lo->ldo_comp_cnt > comp_idx && lo->ldo_comp_entries != NULL); lod_comp = &lo->ldo_comp_entries[comp_idx]; - rc = lod_qos_ost_in_use_clear(env, lod_comp->llc_stripe_count); + rc = lod_qos_tgt_in_use_clear(env, lod_comp->llc_stripe_count); if (rc) GOTO(out, rc); @@ -1044,7 +1226,7 @@ repeat_find: * do not put >1 objects on a single OST, except for * overstriping, where it is intended */ - if (lod_qos_is_ost_used(env, ost_idx, stripe_num)) { + if (lod_qos_is_tgt_used(env, ost_idx, stripe_num)) { if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING) overstriped = true; else @@ -1058,14 +1240,15 @@ repeat_find: lod_comp_is_ost_used(env, lo, ost_idx)) continue; + tgt = LTD_TGT(&m->lod_ost_descs, ost_idx); + /* Drop slow OSCs if we can, but not for requested start idx. 
* * This means "if OSC is slow and it is not the requested * start OST, then it can be skipped, otherwise skip it only * if it is inactive/recovering/out-of-space." */ - rc = lod_statfs_and_check(env, m, &m->lod_ost_descs, ost_idx, - sfs); + rc = lod_statfs_and_check(env, m, &m->lod_ost_descs, tgt); if (rc) { /* this OSP doesn't feel well */ continue; @@ -1076,7 +1259,7 @@ repeat_find: * iteration. Skip OSPs with no objects ready. Don't apply * this logic to OST specified with stripe_offset. */ - if (i != 0 && sfs->os_fprecreated == 0 && speed == 0) + if (i && !tgt->ltd_statfs.os_fprecreated && !speed) continue; o = lod_qos_declare_object_on(env, m, ost_idx, th); @@ -1089,7 +1272,7 @@ repeat_find: /* * We've successfully declared (reserved) an object */ - lod_qos_ost_in_use(env, stripe_num, ost_idx); + lod_qos_tgt_in_use(env, stripe_num, ost_idx); stripe[stripe_num] = o; ost_indices[stripe_num] = ost_idx; stripe_num++; @@ -1164,13 +1347,12 @@ out: * \retval -EINVAL requested OST index is invalid * \retval negative errno on failure */ -static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, - struct dt_object **stripe, __u32 *ost_indices, - int flags, struct thandle *th, int comp_idx) +static int lod_ost_alloc_qos(const struct lu_env *env, struct lod_object *lo, + struct dt_object **stripe, __u32 *ost_indices, + int flags, struct thandle *th, int comp_idx) { struct lod_layout_component *lod_comp; struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); - struct obd_statfs *sfs = &lod_env_info(env)->lti_osfs; struct lod_avoid_guide *lag = &lod_env_info(env)->lti_avoid; struct lod_tgt_desc *ost; struct dt_object *o; @@ -1223,7 +1405,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, if (rc) GOTO(out, rc); - rc = lod_qos_ost_in_use_clear(env, lod_comp->llc_stripe_count); + rc = lod_qos_tgt_in_use_clear(env, lod_comp->llc_stripe_count); if (rc) GOTO(out, rc); @@ -1236,18 +1418,18 @@ static int lod_alloc_qos(const 
struct lu_env *env, struct lod_object *lo, ost = OST_TGT(lod, osts->op_array[i]); ost->ltd_qos.ltq_usable = 0; - rc = lod_statfs_and_check(env, lod, &lod->lod_ost_descs, - osts->op_array[i], sfs); + rc = lod_statfs_and_check(env, lod, &lod->lod_ost_descs, ost); if (rc) { /* this OSP doesn't feel well */ continue; } - if (sfs->os_state & OS_STATE_DEGRADED) + if (ost->ltd_statfs.os_state & OS_STATE_DEGRADED) continue; /* Fail Check before osc_precreate() is called - so we can only 'fail' single OSC. */ + * so we can only 'fail' single OSC. + */ if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && osts->op_array[i] == 0) continue; @@ -1281,7 +1463,8 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, rand = lu_prandom_u64_max(total_weight); /* On average, this will hit larger-weighted OSTs more often. - * 0-weight OSTs will always get used last (only when rand=0) */ + * 0-weight OSTs will always get used last (only when rand=0) + */ for (i = 0; i < osts->op_count; i++) { __u32 idx = osts->op_array[i]; @@ -1311,7 +1494,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, !(lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING)) continue; - if (lod_qos_is_ost_used(env, idx, nfound)) { + if (lod_qos_is_tgt_used(env, idx, nfound)) { if (lod_comp->llc_pattern & LOV_PATTERN_OVERSTRIPING) overstriped = true; @@ -1327,7 +1510,7 @@ static int lod_alloc_qos(const struct lu_env *env, struct lod_object *lo, } lod_avoid_update(lo, lag); - lod_qos_ost_in_use(env, nfound, idx); + lod_qos_tgt_in_use(env, nfound, idx); stripe[nfound] = o; ost_indices[nfound] = idx; ltd_qos_update(&lod->lod_ost_descs, ost, &total_weight); @@ -1385,6 +1568,207 @@ out_nolock: } /** + * Allocate a striping using an algorithm with weights. + * + * The function allocates remote MDT objects to create a striping, the first + * object was already allocated on current MDT to ensure master object and + * the first object are on the same MDT. 
The algorithm used is based on weights + * (both free space and inodes), and it's trying to ensure the space/inodes are + * used evenly by MDTs and MDSs. The striping configuration (# of stripes, + * offset, pool) is taken from the object and is prepared by the caller. + * + * If prepared configuration can't be met due to too few MDTs, then allocation + * fails. + * + * No concurrent allocation is allowed on the object and this must be ensured + * by the caller. All the internal structures are protected by the function. + * + * The algorithm has two steps: find available MDTs and calculate their + * weights, then select the MDTs with their weights used as the probability. + * An MDT with a higher weight is proportionately more likely to be selected + * than one with a lower weight. + * + * \param[in] env execution environment for this thread + * \param[in] lo LOD object + * \param[out] stripes striping created + * + * \retval positive stripes allocated, and it should be equal to + * lo->ldo_dir_stripe_count + * \retval -EAGAIN not enough tgts are found for specified stripe count + * \retval -EINVAL requested MDT index is invalid + * \retval negative errno on failure + */ +int lod_mdt_alloc_qos(const struct lu_env *env, struct lod_object *lo, + struct dt_object **stripes) +{ + struct lod_device *lod = lu2lod_dev(lo->ldo_obj.do_lu.lo_dev); + struct lu_tgt_descs *ltd = &lod->lod_mdt_descs; + struct lu_object_conf conf = { .loc_flags = LOC_F_NEW }; + struct lu_fid fid = { 0 }; + const struct lu_tgt_pool *pool; + struct lu_tgt_desc *mdt; + struct dt_object *dto; + u64 total_weight = 0; + u32 stripe_count = lo->ldo_dir_stripe_count; + unsigned int nfound; + unsigned int good_mdts; + unsigned int i; + int rc = 0; + + ENTRY; + + if (stripe_count == 1) + RETURN(1); + + pool = <d->ltd_tgt_pool; + + /* Detect -EAGAIN early, before expensive lock is taken. */ + if (!ltd_qos_is_usable(ltd)) + RETURN(-EAGAIN); + + /* Do actual allocation, use write lock here. 
*/ + down_write(<d->ltd_qos.lq_rw_sem); + + /* + * Check again, while we were sleeping on @lq_rw_sem things could + * change. + */ + if (!ltd_qos_is_usable(ltd)) + GOTO(unlock, rc = -EAGAIN); + + rc = ltd_qos_penalties_calc(ltd); + if (rc) + GOTO(unlock, rc); + + rc = lod_qos_tgt_in_use_clear(env, stripe_count); + if (rc) + GOTO(unlock, rc); + + good_mdts = 0; + /* Find all the tgts that are valid stripe candidates */ + for (i = 0; i < pool->op_count; i++) { + if (!cfs_bitmap_check(ltd->ltd_tgt_bitmap, pool->op_array[i])) + continue; + + mdt = LTD_TGT(ltd, pool->op_array[i]); + mdt->ltd_qos.ltq_usable = 0; + + rc = lod_is_tgt_usable(ltd, mdt); + if (rc) + continue; + + if (mdt->ltd_statfs.os_state & OS_STATE_DEGRADED) + continue; + + mdt->ltd_qos.ltq_usable = 1; + lu_tgt_qos_weight_calc(mdt); + total_weight += mdt->ltd_qos.ltq_weight; + + good_mdts++; + } + + QOS_DEBUG("found %d good tgts\n", good_mdts); + + if (good_mdts < stripe_count - 1) + GOTO(unlock, rc = -EAGAIN); + + /* Find enough tgts with weighted random allocation. */ + nfound = 1; + while (nfound < stripe_count) { + u64 rand, cur_weight; + + cur_weight = 0; + rc = -ENOSPC; + + rand = lu_prandom_u64_max(total_weight); + + /* On average, this will hit larger-weighted tgts more often. 
+ * 0-weight tgts will always get used last (only when rand=0) */ + for (i = 0; i < pool->op_count; i++) { + __u32 idx = pool->op_array[i]; + int rc2; + + mdt = LTD_TGT(ltd, idx); + + if (!mdt->ltd_qos.ltq_usable) + continue; + + cur_weight += mdt->ltd_qos.ltq_weight; + + QOS_DEBUG("idx=%d nfound=%d cur_weight=%llu rand=%llu total_weight=%llu\n", + idx, nfound, cur_weight, rand, + total_weight); + + if (cur_weight < rand) + continue; + + QOS_DEBUG("stripe=%d to idx=%d\n", nfound, idx); + + if (lod_qos_is_tgt_used(env, idx, nfound)) + continue; + + rc2 = obd_fid_alloc(env, mdt->ltd_exp, &fid, NULL); + if (rc2) { + QOS_DEBUG("can't alloc FID on #%u: %d\n", + idx, rc2); + continue; + } + + conf.loc_flags = LOC_F_NEW; + dto = dt_locate_at(env, mdt->ltd_tgt, &fid, + lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev, + &conf); + if (IS_ERR(dto)) { + QOS_DEBUG("can't alloc stripe on #%u: %d\n", + idx, (int) PTR_ERR(dto)); + continue; + } + + lod_qos_tgt_in_use(env, nfound, idx); + stripes[nfound] = dto; + ltd_qos_update(ltd, mdt, &total_weight); + nfound++; + rc = 0; + break; + } + + /* no MDT found on this iteration, give up */ + if (rc) + break; + } + + if (unlikely(nfound != stripe_count)) { + /* + * when the decision to use weighted algorithm was made + * we had enough appropriate OSPs, but this state can + * change anytime (no space on MDT, broken connection, etc) + * so it's possible OSP won't be able to provide us with + * an object due to just changed state + */ + QOS_DEBUG("%s: wanted %d objects, found only %d\n", + lod2obd(lod)->obd_name, stripe_count, nfound); + for (i = 1; i < nfound; i++) { + LASSERT(stripes[i] != NULL); + dt_object_put(env, stripes[i]); + stripes[i] = NULL; + } + + /* makes sense to rebalance next time */ + ltd->ltd_qos.lq_dirty = 1; + ltd->ltd_qos.lq_same_space = 0; + + rc = -EAGAIN; + } else { + rc = nfound; + } + +unlock: + up_write(<d->ltd_qos.lq_rw_sem); + + RETURN(rc); +} + +/** * Check stripe count the caller can use. 
* * For new layouts (no initialized components), check the total size of the @@ -2041,7 +2425,7 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, * statfs and check OST targets now, since ld_active_tgt_count * could be changed if some OSTs are [de]activated manually. */ - lod_qos_statfs_update(env, d); + lod_qos_statfs_update(env, d, &d->lod_ost_descs); stripe_len = lod_get_stripe_count(d, lo, lod_comp->llc_stripe_count, lod_comp->llc_pattern & @@ -2079,14 +2463,16 @@ int lod_qos_prep_create(const struct lu_env *env, struct lod_object *lo, comp_idx); lod_collect_avoidance(lo, lag, comp_idx); - rc = lod_alloc_qos(env, lo, stripe, ost_indices, flag, - th, comp_idx); + rc = lod_ost_alloc_qos(env, lo, stripe, ost_indices, + flag, th, comp_idx); if (rc == -EAGAIN) - rc = lod_alloc_rr(env, lo, stripe, ost_indices, - flag, th, comp_idx); + rc = lod_ost_alloc_rr(env, lo, stripe, + ost_indices, flag, th, + comp_idx); } else { - rc = lod_alloc_specific(env, lo, stripe, ost_indices, - flag, th, comp_idx); + rc = lod_ost_alloc_specific(env, lo, stripe, + ost_indices, flag, th, + comp_idx); } put_ldts: lod_putref(d, &d->lod_ost_descs); diff --git a/lustre/lod/lproc_lod.c b/lustre/lod/lproc_lod.c index cccc6aa..8297ae3 100644 --- a/lustre/lod/lproc_lod.c +++ b/lustre/lod/lproc_lod.c @@ -46,60 +46,49 @@ #ifdef CONFIG_PROC_FS /** - * Show default stripe size. - * - * \param[in] m seq file - * \param[in] v unused for single entry - * - * \retval 0 on success - * \retval negative error code if failed + * Show DoM default stripe size. 
*/ -static int lod_dom_stripesize_seq_show(struct seq_file *m, void *v) +static ssize_t dom_stripesize_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; - struct lod_device *lod; + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct lod_device *lod = dt2lod_dev(dt); - LASSERT(dev != NULL); - lod = lu2lod_dev(dev->obd_lu_dev); - seq_printf(m, "%u\n", lod->lod_dom_max_stripesize); - return 0; + return snprintf(buf, PAGE_SIZE, "%u\n", lod->lod_dom_max_stripesize); } /** - * Set default stripe size. - * - * \param[in] file proc file - * \param[in] buffer string containing the maximum number of bytes stored in - * each object before moving to the next object in the - * layout (if any) - * \param[in] count @buffer length - * \param[in] off unused for single entry - * - * \retval @count on success - * \retval negative error code if failed + * Set DoM default stripe size. */ -static ssize_t -lod_dom_stripesize_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t dom_stripesize_store(struct kobject *kobj, + struct attribute *attr, const char *buffer, + size_t count) { - struct seq_file *m = file->private_data; - struct obd_device *dev = m->private; - struct lod_device *lod; + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct lod_device *lod = dt2lod_dev(dt); + char tbuf[22] = ""; s64 val; int rc; - LASSERT(dev != NULL); - lod = lu2lod_dev(dev->obd_lu_dev); - rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); + if (count > (sizeof(tbuf) - 1)) + return -EINVAL; + + memcpy(tbuf, buffer, count); + + rc = lu_str_to_s64(tbuf, count, &val, '1'); if (rc) return rc; + if (val < 0) return -ERANGE; /* 1GB is the limit */ if (val > (1ULL << 30)) return -ERANGE; - else if (val > 0) { + + if (val > 0) { if (val < LOV_MIN_STRIPE_SIZE) { LCONSOLE_INFO("Increasing provided stripe size to " "a minimum value %u\n", @@ 
-117,57 +106,39 @@ lod_dom_stripesize_seq_write(struct file *file, const char __user *buffer, return count; } -LPROC_SEQ_FOPS(lod_dom_stripesize); -/** - * Show default stripe size. - * - * \param[in] m seq file - * \param[in] v unused for single entry - * - * \retval 0 on success - * \retval negative error code if failed - */ -static int lod_stripesize_seq_show(struct seq_file *m, void *v) +LUSTRE_RW_ATTR(dom_stripesize); + +static ssize_t stripesize_show(struct kobject *kobj, struct attribute *attr, + char *buf) { - struct obd_device *dev = m->private; - struct lod_device *lod; + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct lod_device *lod = dt2lod_dev(dt); - LASSERT(dev != NULL); - lod = lu2lod_dev(dev->obd_lu_dev); - seq_printf(m, "%llu\n", - lod->lod_ost_descs.ltd_lov_desc.ld_default_stripe_size); - return 0; + return snprintf(buf, PAGE_SIZE, "%llu\n", + lod->lod_ost_descs.ltd_lov_desc.ld_default_stripe_size); } -/** - * Set default stripe size. - * - * \param[in] file proc file - * \param[in] buffer string containing the maximum number of bytes stored in - * each object before moving to the next object in the - * layout (if any) - * \param[in] count @buffer length - * \param[in] off unused for single entry - * - * \retval @count on success - * \retval negative error code if failed - */ -static ssize_t -lod_stripesize_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t stripesize_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) { - struct seq_file *m = file->private_data; - struct obd_device *dev = m->private; - struct lod_device *lod; + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct lod_device *lod = dt2lod_dev(dt); + char tbuf[22] = ""; s64 val; int rc; - LASSERT(dev != NULL); - lod = lu2lod_dev(dev->obd_lu_dev); - rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '1'); + if (count > (sizeof(tbuf) 
- 1)) + return -EINVAL; + + memcpy(tbuf, buffer, count); + + rc = lu_str_to_s64(tbuf, count, &val, '1'); if (rc) return rc; + if (val < 0) return -ERANGE; @@ -176,16 +147,11 @@ lod_stripesize_seq_write(struct file *file, const char __user *buffer, return count; } -LPROC_SEQ_FOPS(lod_stripesize); + +LUSTRE_RW_ATTR(stripesize); /** * Show default stripe offset. - * - * \param[in] m seq file - * \param[in] v unused for single entry - * - * \retval 0 on success - * \retval negative error code if failed */ static ssize_t stripeoffset_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -194,7 +160,7 @@ static ssize_t stripeoffset_show(struct kobject *kobj, struct attribute *attr, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); - return sprintf(buf, "%lld\n", + return snprintf(buf, PAGE_SIZE, "%lld\n", lod->lod_ost_descs.ltd_lov_desc.ld_default_stripe_offset); } @@ -203,17 +169,10 @@ static ssize_t stripeoffset_show(struct kobject *kobj, struct attribute *attr, * * Usually contains -1 allowing Lustre to balance objects among OST * otherwise may cause severe OST imbalance. - * - * \param[in] file proc file - * \param[in] buffer string describing starting OST index for new files - * \param[in] count @buffer length - * \param[in] off unused for single entry - * - * \retval @count on success - * \retval negative error code if failed */ -static ssize_t stripeoffset_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) +static ssize_t stripeoffset_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) { struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj); @@ -232,45 +191,47 @@ static ssize_t stripeoffset_store(struct kobject *kobj, struct attribute *attr, return count; } + LUSTRE_RW_ATTR(stripeoffset); /** * Show default striping pattern (LOV_PATTERN_*). 
- * - * \param[in] m seq file - * \param[in] v unused for single entry - * - * \retval 0 on success - * \retval negative error code if failed */ -static ssize_t stripetype_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static ssize_t __stripetype_show(struct kobject *kobj, struct attribute *attr, + char *buf, bool is_mdt) { struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); + struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs : + &lod->lod_ost_descs; + + return snprintf(buf, PAGE_SIZE, "%u\n", ltd->ltd_lov_desc.ld_pattern); +} - return sprintf(buf, "%u\n", lod->lod_ost_descs.ltd_lov_desc.ld_pattern); +static ssize_t mdt_stripetype_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return __stripetype_show(kobj, attr, buf, true); +} + +static ssize_t stripetype_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return __stripetype_show(kobj, attr, buf, false); } /** * Set default striping pattern (a number, not a human-readable string). - * - * \param[in] file proc file - * \param[in] buffer string containing the default striping pattern for new - * files. This is an integer LOV_PATTERN_* value - * \param[in] count @buffer length - * \param[in] off unused for single entry - * - * \retval @count on success - * \retval negative error code if failed */ -static ssize_t stripetype_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) +static ssize_t __stripetype_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count, bool is_mdt) { struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); + struct lu_tgt_descs *ltd = is_mdt ? 
&lod->lod_mdt_descs : + &lod->lod_ost_descs; u32 pattern; int rc; @@ -278,52 +239,73 @@ static ssize_t stripetype_store(struct kobject *kobj, struct attribute *attr, if (rc) return rc; - lod_fix_desc_pattern(&pattern); - lod->lod_ost_descs.ltd_lov_desc.ld_pattern = pattern; + if (is_mdt) + lod_fix_lmv_desc_pattern(&pattern); + else + lod_fix_desc_pattern(&pattern); + + ltd->ltd_lov_desc.ld_pattern = pattern; return count; } + +static ssize_t mdt_stripetype_store(struct kobject *kobj, + struct attribute *attr, const char *buffer, + size_t count) +{ + return __stripetype_store(kobj, attr, buffer, count, true); +} + +static ssize_t stripetype_store(struct kobject *kobj, + struct attribute *attr, const char *buffer, + size_t count) +{ + return __stripetype_store(kobj, attr, buffer, count, false); +} + +LUSTRE_RW_ATTR(mdt_stripetype); LUSTRE_RW_ATTR(stripetype); /** * Show default number of stripes. - * - * \param[in] m seq file - * \param[in] v unused for single entry - * - * \retval 0 on success, - * \retval negative error code if failed */ -static ssize_t stripecount_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static ssize_t __stripecount_show(struct kobject *kobj, struct attribute *attr, + char *buf, bool is_mdt) { struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); - struct lov_desc *desc = &lod->lod_ost_descs.ltd_lov_desc; + struct lov_desc *desc = is_mdt ? &lod->lod_mdt_descs.ltd_lov_desc : + &lod->lod_ost_descs.ltd_lov_desc; - return sprintf(buf, "%d\n", + return snprintf(buf, PAGE_SIZE, "%d\n", (s16)(desc->ld_default_stripe_count + 1) - 1); } +static ssize_t mdt_stripecount_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return __stripecount_show(kobj, attr, buf, true); +} + +static ssize_t stripecount_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return __stripecount_show(kobj, attr, buf, false); +} + /** * Set default number of stripes. 
- * - * \param[in] file proc file - * \param[in] buffer string containing the default number of stripes - * for new files - * \param[in] count @buffer length - * \param[in] off unused for single entry - * - * \retval @count on success - * \retval negative error code otherwise */ -static ssize_t stripecount_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) +static ssize_t __stripecount_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count, + bool is_mdt) { struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); + struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs : + &lod->lod_ost_descs; int stripe_count; int rc; @@ -335,61 +317,91 @@ static ssize_t stripecount_store(struct kobject *kobj, struct attribute *attr, return -ERANGE; lod_fix_desc_stripe_count(&stripe_count); - lod->lod_ost_descs.ltd_lov_desc.ld_default_stripe_count = stripe_count; + ltd->ltd_lov_desc.ld_default_stripe_count = stripe_count; return count; } + +static ssize_t mdt_stripecount_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + return __stripecount_store(kobj, attr, buffer, count, true); +} + +static ssize_t stripecount_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + return __stripecount_store(kobj, attr, buffer, count, false); +} + +LUSTRE_RW_ATTR(mdt_stripecount); LUSTRE_RW_ATTR(stripecount); /** * Show number of targets. 
- * - * \param[in] m seq file - * \param[in] v unused for single entry - * - * \retval 0 on success - * \retval negative error code if failed */ -static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static ssize_t __numobd_show(struct kobject *kobj, struct attribute *attr, + char *buf, bool is_mdt) { struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); + struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs : + &lod->lod_ost_descs; + + return snprintf(buf, PAGE_SIZE, "%u\n", ltd->ltd_lov_desc.ld_tgt_count); +} + +static ssize_t mdt_numobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return __numobd_show(kobj, attr, buf, true); +} - return sprintf(buf, "%u\n", lod->lod_ost_count); +static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return __numobd_show(kobj, attr, buf, false); } + +LUSTRE_RO_ATTR(mdt_numobd); LUSTRE_RO_ATTR(numobd); /** * Show number of active targets. - * - * \param[in] m seq file - * \param[in] v unused for single entry - * - * \retval 0 on success - * \retval negative error code if failed */ -static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static ssize_t __activeobd_show(struct kobject *kobj, struct attribute *attr, + char *buf, bool is_mdt) { struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); + struct lu_tgt_descs *ltd = is_mdt ? 
&lod->lod_mdt_descs : + &lod->lod_ost_descs; + + return snprintf(buf, PAGE_SIZE, "%u\n", + ltd->ltd_lov_desc.ld_active_tgt_count); +} - return sprintf(buf, "%u\n", - lod->lod_ost_descs.ltd_lov_desc.ld_active_tgt_count); +static ssize_t mdt_activeobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return __activeobd_show(kobj, attr, buf, true); } + +static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return __activeobd_show(kobj, attr, buf, false); +} + +LUSTRE_RO_ATTR(mdt_activeobd); LUSTRE_RO_ATTR(activeobd); /** * Show UUID of LOD device. - * - * \param[in] m seq file - * \param[in] v unused for single entry - * - * \retval 0 on success - * \retval negative error code if failed */ static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -398,7 +410,7 @@ static ssize_t desc_uuid_show(struct kobject *kobj, struct attribute *attr, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); - return sprintf(buf, "%s\n", + return snprintf(buf, PAGE_SIZE, "%s\n", lod->lod_ost_descs.ltd_lov_desc.ld_uuid.uuid); } LUSTRE_RO_ATTR(desc_uuid); @@ -410,23 +422,31 @@ LUSTRE_RO_ATTR(desc_uuid); * of free space compared to performance. 0% means select OSTs equally * regardless of their free space, 100% means select OSTs only by their free * space even if it results in very imbalanced load on the OSTs. - * - * \param[in] m seq file - * \param[in] v unused for single entry - * - * \retval 0 on success - * \retval negative error code if failed */ -static ssize_t qos_prio_free_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static ssize_t __qos_prio_free_show(struct kobject *kobj, + struct attribute *attr, char *buf, + bool is_mdt) { struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); + struct lu_tgt_descs *ltd = is_mdt ? 
&lod->lod_mdt_descs : + &lod->lod_ost_descs; + + return snprintf(buf, PAGE_SIZE, "%d%%\n", + (ltd->ltd_qos.lq_prio_free * 100 + 255) >> 8); +} - return sprintf(buf, "%d%%\n", - (lod->lod_ost_descs.ltd_qos.lq_prio_free * 100 + 255) >> - 8); +static ssize_t mdt_qos_prio_free_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return __qos_prio_free_show(kobj, attr, buf, true); +} + +static ssize_t qos_prio_free_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return __qos_prio_free_show(kobj, attr, buf, false); } /** @@ -436,21 +456,17 @@ static ssize_t qos_prio_free_show(struct kobject *kobj, struct attribute *attr, * are space imbalanced. See lod_qos_priofree_seq_show() for description of * this parameter. See lod_qos_thresholdrr_seq_write() and lq_threshold_rr to * determine what constitutes "space imbalanced" OSTs. - * - * \param[in] file proc file - * \param[in] buffer string which contains the free space priority (0-100) - * \param[in] count @buffer length - * \param[in] off unused for single entry - * - * \retval @count on success - * \retval negative error code if failed */ -static ssize_t qos_prio_free_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) +static ssize_t __qos_prio_free_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count, + bool is_mdt) { struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); + struct lu_tgt_descs *ltd = is_mdt ? 
&lod->lod_mdt_descs : + &lod->lod_ost_descs; unsigned int val; int rc; @@ -460,34 +476,56 @@ static ssize_t qos_prio_free_store(struct kobject *kobj, struct attribute *attr, if (val > 100) return -EINVAL; - lod->lod_ost_descs.ltd_qos.lq_prio_free = (val << 8) / 100; - lod->lod_ost_descs.ltd_qos.lq_dirty = 1; - lod->lod_ost_descs.ltd_qos.lq_reset = 1; + ltd->ltd_qos.lq_prio_free = (val << 8) / 100; + ltd->ltd_qos.lq_dirty = 1; + ltd->ltd_qos.lq_reset = 1; return count; } + +static ssize_t mdt_qos_prio_free_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + return __qos_prio_free_store(kobj, attr, buffer, count, true); +} + +static ssize_t qos_prio_free_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + return __qos_prio_free_store(kobj, attr, buffer, count, false); +} + +LUSTRE_RW_ATTR(mdt_qos_prio_free); LUSTRE_RW_ATTR(qos_prio_free); /** * Show threshold for "same space on all OSTs" rule. - * - * \param[in] m seq file - * \param[in] v unused for single entry - * - * \retval 0 on success - * \retval negative error code if failed */ -static int lod_qos_thresholdrr_seq_show(struct seq_file *m, void *v) +static ssize_t __qos_thresholdrr_show(struct kobject *kobj, + struct attribute *attr, char *buf, + bool is_mdt) { - struct obd_device *dev = m->private; - struct lod_device *lod; + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct lod_device *lod = dt2lod_dev(dt); + struct lu_tgt_descs *ltd = is_mdt ? 
&lod->lod_mdt_descs : + &lod->lod_ost_descs; - LASSERT(dev != NULL); - lod = lu2lod_dev(dev->obd_lu_dev); - seq_printf(m, "%d%%\n", - (lod->lod_ost_descs.ltd_qos.lq_threshold_rr * 100 + 255) >> - 8); - return 0; + return snprintf(buf, PAGE_SIZE, "%d%%\n", + (ltd->ltd_qos.lq_threshold_rr * 100 + 255) >> 8); +} + +static ssize_t mdt_qos_thresholdrr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return __qos_thresholdrr_show(kobj, attr, buf, true); +} + +static ssize_t lod_qos_thresholdrr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return __qos_thresholdrr_show(kobj, attr, buf, false); } /** @@ -498,80 +536,89 @@ static int lod_qos_thresholdrr_seq_show(struct seq_file *m, void *v) * is exceeded, use the QoS allocator to select OSTs based on their available * space so that more full OSTs are chosen less often, otherwise use the * round-robin allocator for efficiency and performance. - - * \param[in] file proc file - * \param[in] buffer string containing percentage difference of free space - * \param[in] count @buffer length - * \param[in] off unused for single entry - * - * \retval @count on success - * \retval negative error code if failed */ -static ssize_t -lod_qos_thresholdrr_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) +static ssize_t __qos_thresholdrr_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count, + bool is_mdt) { - struct seq_file *m = file->private_data; - struct obd_device *dev = m->private; - struct lod_device *lod; + struct dt_device *dt = container_of(kobj, struct dt_device, + dd_kobj); + struct lod_device *lod = dt2lod_dev(dt); + struct lu_tgt_descs *ltd = is_mdt ? 
&lod->lod_mdt_descs : + &lod->lod_ost_descs; + unsigned int val; int rc; - __s64 val; - LASSERT(dev != NULL); - lod = lu2lod_dev(dev->obd_lu_dev); - - rc = lprocfs_str_with_units_to_s64(buffer, count, &val, '%'); + rc = kstrtouint(buffer, 0, &val); if (rc) return rc; - if (val > 100 || val < 0) + if (val > 100) return -EINVAL; - - lod->lod_ost_descs.ltd_qos.lq_threshold_rr = (val << 8) / 100; - lod->lod_ost_descs.ltd_qos.lq_dirty = 1; + ltd->ltd_qos.lq_threshold_rr = (val << 8) / 100; + ltd->ltd_qos.lq_dirty = 1; return count; } -LPROC_SEQ_FOPS(lod_qos_thresholdrr); + +static ssize_t mdt_qos_thresholdrr_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + return __qos_thresholdrr_store(kobj, attr, buffer, count, true); +} + +static ssize_t lod_qos_thresholdrr_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + return __qos_thresholdrr_store(kobj, attr, buffer, count, false); +} + +LUSTRE_RW_ATTR(mdt_qos_thresholdrr); +LUSTRE_RW_ATTR(lod_qos_thresholdrr); /** * Show expiration period used to refresh cached statfs data, which * is used to implement QoS/RR striping allocation algorithm. - * - * \param[in] m seq file - * \param[in] v unused for single entry - * - * \retval 0 on success - * \retval negative error code if failed */ -static ssize_t qos_maxage_show(struct kobject *kobj, struct attribute *attr, - char *buf) +static ssize_t __qos_maxage_show(struct kobject *kobj, struct attribute *attr, + char *buf, bool is_mdt) { struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); + struct lu_tgt_descs *ltd = is_mdt ? 
&lod->lod_mdt_descs : + &lod->lod_ost_descs; - return sprintf(buf, "%u Sec\n", - lod->lod_ost_descs.ltd_lov_desc.ld_qos_maxage); + return snprintf(buf, PAGE_SIZE, "%u Sec\n", + ltd->ltd_lov_desc.ld_qos_maxage); +} + +static ssize_t mdt_qos_maxage_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return __qos_maxage_show(kobj, attr, buf, true); +} + +static ssize_t qos_maxage_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + return __qos_maxage_show(kobj, attr, buf, false); } /** * Set expiration period used to refresh cached statfs data. - * - * \param[in] file proc file - * \param[in] buffer string contains maximum age of statfs data in seconds - * \param[in] count @buffer length - * \param[in] off unused for single entry - * - * \retval @count on success - * \retval negative error code if failed */ -static ssize_t qos_maxage_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) +static ssize_t __qos_maxage_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count, bool is_mdt) { struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); + struct lu_tgt_descs *ltd = is_mdt ? 
&lod->lod_mdt_descs : + &lod->lod_ost_descs; struct lustre_cfg_bufs bufs; struct lu_device *next; struct lustre_cfg *lcfg; @@ -586,7 +633,8 @@ static ssize_t qos_maxage_store(struct kobject *kobj, struct attribute *attr, if (val <= 0) return -EINVAL; - lod->lod_ost_descs.ltd_lov_desc.ld_qos_maxage = val; + + ltd->ltd_lov_desc.ld_qos_maxage = val; /* * propogate the value down to OSPs @@ -599,67 +647,117 @@ static ssize_t qos_maxage_store(struct kobject *kobj, struct attribute *attr, return -ENOMEM; lustre_cfg_init(lcfg, LCFG_PARAM, &bufs); - lod_getref(&lod->lod_ost_descs); - lod_foreach_ost(lod, tgt) { + lod_getref(ltd); + ltd_foreach_tgt(ltd, tgt) { next = &tgt->ltd_tgt->dd_lu_dev; rc = next->ld_ops->ldo_process_config(NULL, next, lcfg); if (rc) CERROR("can't set maxage on #%d: %d\n", tgt->ltd_index, rc); } - lod_putref(lod, &lod->lod_ost_descs); + lod_putref(lod, ltd); OBD_FREE(lcfg, lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)); return count; } + +static ssize_t mdt_qos_maxage_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + return __qos_maxage_store(kobj, attr, buffer, count, true); +} + +static ssize_t qos_maxage_store(struct kobject *kobj, struct attribute *attr, + const char *buffer, size_t count) +{ + return __qos_maxage_store(kobj, attr, buffer, count, false); +} + +LUSTRE_RW_ATTR(mdt_qos_maxage); LUSTRE_RW_ATTR(qos_maxage); -static void *lod_osts_seq_start(struct seq_file *p, loff_t *pos) +static void *lod_tgts_seq_start(struct seq_file *p, loff_t *pos, bool is_mdt) { struct obd_device *dev = p->private; - struct lod_device *lod; + struct lod_device *lod = lu2lod_dev(dev->obd_lu_dev); + struct lu_tgt_descs *ltd = is_mdt ? 
&lod->lod_mdt_descs : + &lod->lod_ost_descs; LASSERT(dev != NULL); - lod = lu2lod_dev(dev->obd_lu_dev); - lod_getref(&lod->lod_ost_descs); /* released in lod_osts_seq_stop */ - if (*pos >= lod->lod_ost_bitmap->size) + lod_getref(ltd); /* released in lod_tgts_seq_stop */ + if (*pos >= ltd->ltd_tgt_bitmap->size) return NULL; - *pos = find_next_bit(lod->lod_ost_bitmap->data, - lod->lod_ost_bitmap->size, *pos); - if (*pos < lod->lod_ost_bitmap->size) - return OST_TGT(lod,*pos); + *pos = find_next_bit(ltd->ltd_tgt_bitmap->data, + ltd->ltd_tgt_bitmap->size, *pos); + if (*pos < ltd->ltd_tgt_bitmap->size) + return LTD_TGT(ltd, *pos); else return NULL; } -static void lod_osts_seq_stop(struct seq_file *p, void *v) +static void *lod_mdts_seq_start(struct seq_file *p, loff_t *pos) +{ + return lod_tgts_seq_start(p, pos, true); +} + +static void *lod_osts_seq_start(struct seq_file *p, loff_t *pos) +{ + return lod_tgts_seq_start(p, pos, false); +} + +static void lod_tgts_seq_stop(struct seq_file *p, void *v, bool is_mdt) { struct obd_device *dev = p->private; - struct lod_device *lod; + struct lod_device *lod = lu2lod_dev(dev->obd_lu_dev); + struct lu_tgt_descs *ltd = is_mdt ? &lod->lod_mdt_descs : + &lod->lod_ost_descs; LASSERT(dev != NULL); - lod = lu2lod_dev(dev->obd_lu_dev); - lod_putref(lod, &lod->lod_ost_descs); + lod_putref(lod, ltd); } -static void *lod_osts_seq_next(struct seq_file *p, void *v, loff_t *pos) +static void lod_mdts_seq_stop(struct seq_file *p, void *v) +{ + lod_tgts_seq_stop(p, v, true); +} + +static void lod_osts_seq_stop(struct seq_file *p, void *v) +{ + lod_tgts_seq_stop(p, v, false); +} + +static void *lod_tgts_seq_next(struct seq_file *p, void *v, loff_t *pos, + bool is_mdt) { struct obd_device *dev = p->private; struct lod_device *lod = lu2lod_dev(dev->obd_lu_dev); + struct lu_tgt_descs *ltd = is_mdt ? 
&lod->lod_mdt_descs : + &lod->lod_ost_descs; - if (*pos >= lod->lod_ost_bitmap->size - 1) + if (*pos >= ltd->ltd_tgt_bitmap->size - 1) return NULL; - *pos = find_next_bit(lod->lod_ost_bitmap->data, - lod->lod_ost_bitmap->size, *pos + 1); - if (*pos < lod->lod_ost_bitmap->size) - return OST_TGT(lod,*pos); + *pos = find_next_bit(ltd->ltd_tgt_bitmap->data, + ltd->ltd_tgt_bitmap->size, *pos + 1); + if (*pos < ltd->ltd_tgt_bitmap->size) + return LTD_TGT(ltd, *pos); else return NULL; } +static void *lod_mdts_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + return lod_tgts_seq_next(p, v, pos, true); +} + +static void *lod_osts_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + return lod_tgts_seq_next(p, v, pos, false); +} + /** * Show active/inactive status for OST found by lod_osts_seq_next(). * @@ -669,45 +767,62 @@ static void *lod_osts_seq_next(struct seq_file *p, void *v, loff_t *pos) * \retval 0 on success * \retval negative error code if failed */ -static int lod_osts_seq_show(struct seq_file *p, void *v) +static int lod_tgts_seq_show(struct seq_file *p, void *v) { - struct obd_device *obd = p->private; - struct lu_tgt_desc *ost_desc = v; - struct lod_device *lod; - int idx, rc, active; - struct dt_device *next; - struct obd_statfs sfs; + struct obd_device *obd = p->private; + struct lu_tgt_desc *tgt = v; + struct dt_device *next; + int rc, active; LASSERT(obd->obd_lu_dev); - lod = lu2lod_dev(obd->obd_lu_dev); - idx = ost_desc->ltd_index; - next = OST_TGT(lod, idx)->ltd_tgt; - if (next == NULL) + next = tgt->ltd_tgt; + if (!next) return -EINVAL; /* XXX: should be non-NULL env, but it's very expensive */ active = 1; - rc = dt_statfs(NULL, next, &sfs); + rc = dt_statfs(NULL, next, &tgt->ltd_statfs); if (rc == -ENOTCONN) { active = 0; rc = 0; } else if (rc) return rc; - seq_printf(p, "%d: %s %sACTIVE\n", idx, - obd_uuid2str(&ost_desc->ltd_uuid), + seq_printf(p, "%d: %s %sACTIVE\n", tgt->ltd_index, + obd_uuid2str(&tgt->ltd_uuid), active ? 
"" : "IN"); return 0; } +static const struct seq_operations lod_mdts_sops = { + .start = lod_mdts_seq_start, + .stop = lod_mdts_seq_stop, + .next = lod_mdts_seq_next, + .show = lod_tgts_seq_show, +}; + static const struct seq_operations lod_osts_sops = { .start = lod_osts_seq_start, .stop = lod_osts_seq_stop, .next = lod_osts_seq_next, - .show = lod_osts_seq_show, + .show = lod_tgts_seq_show, }; +static int lod_mdts_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lod_mdts_sops); + if (rc) + return rc; + + seq = file->private_data; + seq->private = PDE_DATA(inode); + return 0; +} + static int lod_osts_seq_open(struct inode *inode, struct file *file) { struct seq_file *seq; @@ -724,12 +839,6 @@ static int lod_osts_seq_open(struct inode *inode, struct file *file) /** * Show whether special failout mode for testing is enabled or not. - * - * \param[in] m seq file - * \param[in] v unused for single entry - * - * \retval 0 on success - * \retval negative error code if failed */ static ssize_t lmv_failout_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -738,7 +847,7 @@ static ssize_t lmv_failout_show(struct kobject *kobj, struct attribute *attr, dd_kobj); struct lod_device *lod = dt2lod_dev(dt); - return sprintf(buf, "%d\n", lod->lod_lmv_failout ? 1 : 0); + return snprintf(buf, PAGE_SIZE, "%d\n", lod->lod_lmv_failout ? 1 : 0); } /** @@ -747,14 +856,6 @@ static ssize_t lmv_failout_show(struct kobject *kobj, struct attribute *attr, * This determines whether the LMV will try to continue processing a striped * directory even if it has a (partly) corrupted entry in the master directory, * or if it will abort upon finding a corrupted slave directory entry. 
- * - * \param[in] file proc file - * \param[in] buffer string: 0 or non-zero to disable or enable LMV failout - * \param[in] count @buffer length - * \param[in] off unused for single entry - * - * \retval @count on success - * \retval negative error code if failed */ static ssize_t lmv_failout_store(struct kobject *kobj, struct attribute *attr, const char *buffer, size_t count) @@ -776,15 +877,17 @@ static ssize_t lmv_failout_store(struct kobject *kobj, struct attribute *attr, LUSTRE_RW_ATTR(lmv_failout); static struct lprocfs_vars lprocfs_lod_obd_vars[] = { - { .name = "stripesize", - .fops = &lod_stripesize_fops }, - { .name = "qos_threshold_rr", - .fops = &lod_qos_thresholdrr_fops }, - { .name = "dom_stripesize", - .fops = &lod_dom_stripesize_fops }, { NULL } }; +static const struct file_operations lod_proc_mdt_fops = { + .owner = THIS_MODULE, + .open = lod_mdts_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; + static const struct file_operations lod_proc_target_fops = { .owner = THIS_MODULE, .open = lod_osts_seq_open, @@ -794,6 +897,8 @@ static const struct file_operations lod_proc_target_fops = { }; static struct attribute *lod_attrs[] = { + &lustre_attr_dom_stripesize.attr, + &lustre_attr_stripesize.attr, &lustre_attr_stripeoffset.attr, &lustre_attr_stripecount.attr, &lustre_attr_stripetype.attr, @@ -803,6 +908,14 @@ static struct attribute *lod_attrs[] = { &lustre_attr_numobd.attr, &lustre_attr_qos_maxage.attr, &lustre_attr_qos_prio_free.attr, + &lustre_attr_lod_qos_thresholdrr.attr, + &lustre_attr_mdt_stripecount.attr, + &lustre_attr_mdt_stripetype.attr, + &lustre_attr_mdt_activeobd.attr, + &lustre_attr_mdt_numobd.attr, + &lustre_attr_mdt_qos_maxage.attr, + &lustre_attr_mdt_qos_prio_free.attr, + &lustre_attr_mdt_qos_thresholdrr.attr, NULL, }; @@ -842,6 +955,14 @@ int lod_procfs_init(struct lod_device *lod) GOTO(out, rc); } + rc = lprocfs_seq_create(obd->obd_proc_entry, "mdt_obd", + 0444, &lod_proc_mdt_fops, obd); 
+ if (rc) { + CWARN("%s: Error adding the target_obd file %d\n", + obd->obd_name, rc); + GOTO(out, rc); + } + rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd", 0444, &lod_proc_target_fops, obd); if (rc) { @@ -932,4 +1053,3 @@ void lod_procfs_fini(struct lod_device *lod) } #endif /* CONFIG_PROC_FS */ - diff --git a/lustre/mdt/mdt_reint.c b/lustre/mdt/mdt_reint.c index 73c710e..81c148f 100644 --- a/lustre/mdt/mdt_reint.c +++ b/lustre/mdt/mdt_reint.c @@ -768,19 +768,6 @@ static int mdt_reint_setattr(struct mdt_thread_info *info, buf->lb_buf = lmu; buf->lb_len = ma->ma_lmv_size; - - if (le32_to_cpu(lmu->lum_hash_type) & - LMV_HASH_FLAG_SPACE) { - /* - * only allow setting "space" hash flag on - * plain directory. - */ - rc = mdt_object_striped(info, mo); - if (rc) - GOTO(out_put, - rc = (rc == 1) ? -EPERM : rc); - } - name = XATTR_NAME_DEFAULT_LMV; /* force client to update dir default layout */ lockpart |= MDS_INODELOCK_LOOKUP; diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 11fdc78..7030e8e 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -1989,34 +1989,31 @@ static int str_to_u64_parse(char *buffer, unsigned long count, * have a unit as the last character. The function handles overflow/underflow * of the signed integer. 
*/ -static int str_to_s64_internal(const char __user *buffer, unsigned long count, - __s64 *val, __u64 def_mult, bool allow_units) +int lu_str_to_s64(char *buffer, unsigned long count, __s64 *val, char defunit) { - char kernbuf[22]; + __u64 mult = 1; __u64 tmp; unsigned int offset = 0; int signed sign = 1; __u64 max = LLONG_MAX; int rc = 0; - if (count > (sizeof(kernbuf) - 1)) - return -EINVAL; - - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - - kernbuf[count] = '\0'; + if (defunit != '1') { + rc = get_mult(defunit, &mult); + if (rc) + return rc; + } /* keep track of our sign */ - if (*kernbuf == '-') { + if (*buffer == '-') { sign = -1; offset++; /* equivalent to max = -LLONG_MIN, avoids overflow */ max++; } - rc = str_to_u64_parse(kernbuf + offset, count - offset, - &tmp, def_mult, allow_units); + rc = str_to_u64_parse(buffer + offset, count - offset, + &tmp, mult, true); if (rc) return rc; @@ -2028,6 +2025,7 @@ static int str_to_s64_internal(const char __user *buffer, unsigned long count, return 0; } +EXPORT_SYMBOL(lu_str_to_s64); /* identical to s64 version, but does not handle overflow */ static int str_to_u64_internal(const char __user *buffer, unsigned long count, @@ -2072,16 +2070,17 @@ static int str_to_u64_internal(const char __user *buffer, unsigned long count, int lprocfs_str_with_units_to_s64(const char __user *buffer, unsigned long count, __s64 *val, char defunit) { - __u64 mult = 1; - int rc; + char kernbuf[22]; - if (defunit != '1') { - rc = get_mult(defunit, &mult); - if (rc) - return rc; - } + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; - return str_to_s64_internal(buffer, count, val, mult, true); + return lu_str_to_s64(kernbuf, count, val, defunit); } EXPORT_SYMBOL(lprocfs_str_with_units_to_s64); diff --git a/lustre/obdclass/lu_tgt_descs.c b/lustre/obdclass/lu_tgt_descs.c index c7d8bbe..5ffe4ea 100644 --- 
a/lustre/obdclass/lu_tgt_descs.c +++ b/lustre/obdclass/lu_tgt_descs.c @@ -110,10 +110,6 @@ int lu_qos_add_tgt(struct lu_qos *qos, struct lu_tgt_desc *tgt) ENTRY; - /* tgt not connected, this function will be called again later */ - if (!exp) - RETURN(0); - down_write(&qos->lq_rw_sem); /* * a bit hacky approach to learn NID of corresponding connection @@ -531,7 +527,7 @@ int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd) * per-tgt penalty is * prio * bavail * iavail / (num_tgt - 1) / 2 */ - tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia; + tgt->ltd_qos.ltq_penalty_per_obj = prio_wide * ba * ia >> 8; do_div(tgt->ltd_qos.ltq_penalty_per_obj, num_active); tgt->ltd_qos.ltq_penalty_per_obj >>= 1; @@ -565,8 +561,9 @@ int ltd_qos_penalties_calc(struct lu_tgt_descs *ltd) list_for_each_entry(svr, &qos->lq_svr_list, lsq_svr_list) { ba = svr->lsq_bavail; ia = svr->lsq_iavail; - svr->lsq_penalty_per_obj = prio_wide * ba * ia; - do_div(ba, svr->lsq_tgt_count * num_active); + svr->lsq_penalty_per_obj = prio_wide * ba * ia >> 8; + do_div(svr->lsq_penalty_per_obj, + svr->lsq_tgt_count * num_active); svr->lsq_penalty_per_obj >>= 1; age = (now - svr->lsq_used) >> 3; @@ -661,6 +658,7 @@ int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt, if (!tgt->ltd_active) continue; + ltq = &tgt->ltd_qos; if (ltq->ltq_penalty < ltq->ltq_penalty_per_obj) ltq->ltq_penalty = 0; else @@ -672,9 +670,10 @@ int ltd_qos_update(struct lu_tgt_descs *ltd, struct lu_tgt_desc *tgt, if (ltq->ltq_usable) *total_wt += ltq->ltq_weight; - CDEBUG(D_OTHER, "recalc tgt %d usable=%d avail=%llu tgtppo=%llu tgtp=%llu svrppo=%llu svrp=%llu wt=%llu\n", + CDEBUG(D_OTHER, "recalc tgt %d usable=%d bavail=%llu ffree=%llu tgtppo=%llu tgtp=%llu svrppo=%llu svrp=%llu wt=%llu\n", tgt->ltd_index, ltq->ltq_usable, - tgt_statfs_bavail(tgt) >> 10, + tgt_statfs_bavail(tgt) >> 16, + tgt_statfs_iavail(tgt) >> 8, ltq->ltq_penalty_per_obj >> 10, ltq->ltq_penalty >> 10, ltq->ltq_svr->lsq_penalty_per_obj >> 10, 
diff --git a/lustre/ptlrpc/wiretest.c b/lustre/ptlrpc/wiretest.c index b6fa9ae..3056a0b 100644 --- a/lustre/ptlrpc/wiretest.c +++ b/lustre/ptlrpc/wiretest.c @@ -1861,7 +1861,6 @@ void lustre_assert_wire_constants(void) CLASSERT(LMV_MAGIC_V1 == 0x0CD20CD0); CLASSERT(LMV_MAGIC_STRIPE == 0x0CD40CD0); CLASSERT(LMV_HASH_TYPE_MASK == 0x0000ffff); - CLASSERT(LMV_HASH_FLAG_SPACE == 0x08000000); CLASSERT(LMV_HASH_FLAG_LOST_LMV == 0x10000000); CLASSERT(LMV_HASH_FLAG_BAD_TYPE == 0x20000000); CLASSERT(LMV_HASH_FLAG_MIGRATION == 0x80000000); diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 0484594..62ff94f 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -20750,87 +20750,86 @@ test_412() { } run_test 412 "mkdir on specific MDTs" -test_413a() { - [ $MDSCOUNT -lt 2 ] && - skip "We need at least 2 MDTs for this test" - - if [ $(lustre_version_code mds1) -lt $(version_code 2.10.55) ]; then - skip "Need server version at least 2.10.55" - fi - - mkdir $DIR/$tdir || error "mkdir failed" - - # find MDT that is the most full - local max=$($LFS df | grep MDT | - awk 'BEGIN { a=0 } - { sub("%", "", $5) - if (0+$5 >= a) - { - a = $5 - b = $6 - } - } - END { split(b, c, ":") - sub("]", "", c[2]) - print c[2] - }') - - for i in $(seq $((MDSCOUNT - 1))); do - $LFS mkdir -c $i $DIR/$tdir/d$i || - error "mkdir d$i failed" - $LFS getdirstripe $DIR/$tdir/d$i - local stripe_index=$($LFS getdirstripe -i $DIR/$tdir/d$i) - [ $stripe_index -ne $max ] || - error "don't expect $max" - done -} -run_test 413a "mkdir on less full MDTs" - -test_413b() { - [ $MDSCOUNT -lt 2 ] && - skip "We need at least 2 MDTs for this test" - - [ $MDS1_VERSION -lt $(version_code 2.12.52) ] && - skip "Need server version at least 2.12.52" - - mkdir $DIR/$tdir || error "mkdir failed" - $LFS setdirstripe -D -i -1 -H space $DIR/$tdir || - error "setdirstripe failed" +test_qos_mkdir() { + local mkdir_cmd=$1 + local stripe_count=$2 + local mdts=$(comma_list $(mdts_nodes)) - local qos_prio_free 
- local qos_threshold_rr + local testdir + local lmv_qos_prio_free + local lmv_qos_threshold_rr + local lmv_qos_maxage + local lod_qos_prio_free + local lod_qos_threshold_rr + local lod_qos_maxage local count + local i - qos_prio_free=$($LCTL get_param -n lmv.*.qos_prio_free | head -n1) - qos_prio_free=${qos_prio_free%%%} - qos_threshold_rr=$($LCTL get_param -n lmv.*.qos_threshold_rr | head -n1) - qos_threshold_rr=${qos_threshold_rr%%%} - qos_maxage=$($LCTL get_param -n lmv.*.qos_maxage) - - stack_trap "$LCTL set_param lmv.*.qos_prio_free=$qos_prio_free" EXIT - stack_trap "$LCTL set_param lmv.*.qos_threshold_rr=$qos_threshold_rr" \ + lmv_qos_prio_free=$($LCTL get_param -n lmv.*.qos_prio_free | head -n1) + lmv_qos_prio_free=${lmv_qos_prio_free%%%} + lmv_qos_threshold_rr=$($LCTL get_param -n lmv.*.qos_threshold_rr | + head -n1) + lmv_qos_threshold_rr=${lmv_qos_threshold_rr%%%} + lmv_qos_maxage=$($LCTL get_param -n lmv.*.qos_maxage) + stack_trap "$LCTL set_param \ + lmv.*.qos_prio_free=$lmv_qos_prio_free > /dev/null" EXIT + stack_trap "$LCTL set_param \ + lmv.*.qos_threshold_rr=$lmv_qos_threshold_rr > /dev/null" EXIT + stack_trap "$LCTL set_param \ + lmv.*.qos_maxage=$lmv_qos_maxage > /dev/null" EXIT + + lod_qos_prio_free=$(do_facet mds1 $LCTL get_param -n \ + lod.lustre-MDT0000-mdtlov.mdt_qos_prio_free | head -n1) + lod_qos_prio_free=${lod_qos_prio_free%%%} + lod_qos_threshold_rr=$(do_facet mds1 $LCTL get_param -n \ + lod.lustre-MDT0000-mdtlov.mdt_qos_thresholdrr | head -n1) + lod_qos_threshold_rr=${lod_qos_threshold_rr%%%} + lod_qos_maxage=$(do_facet mds1 $LCTL get_param -n \ + lod.lustre-MDT0000-mdtlov.qos_maxage | awk '{ print $1 }') + stack_trap "do_nodes $mdts $LCTL set_param \ + lod.*.mdt_qos_prio_free=$lod_qos_prio_free > /dev/null" EXIT + stack_trap "do_nodes $mdts $LCTL set_param \ + lod.*.mdt_qos_thresholdrr=$lod_qos_threshold_rr > /dev/null" \ EXIT - stack_trap "$LCTL set_param lmv.*.qos_maxage=$qos_maxage" EXIT + stack_trap "do_nodes $mdts $LCTL set_param 
\ + lod.*.mdt_qos_maxage=$lod_qos_maxage > /dev/null" EXIT + + echo + echo "Mkdir (stripe_count $stripe_count) roundrobin:" - echo "mkdir with roundrobin" + $LCTL set_param lmv.*.qos_threshold_rr=100 > /dev/null + do_nodes $mdts $LCTL set_param lod.*.mdt_qos_thresholdrr=100 > /dev/null + + testdir=$DIR/$tdir-s$stripe_count/rr - $LCTL set_param lmv.*.qos_threshold_rr=100 for i in $(seq $((100 * MDSCOUNT))); do - mkdir $DIR/$tdir/subdir$i || error "mkdir subdir$i failed" + eval $mkdir_cmd $testdir/subdir$i || + error "$mkdir_cmd subdir$i failed" done + for i in $(seq $MDSCOUNT); do - count=$($LFS getdirstripe -i $DIR/$tdir/* | grep ^$((i - 1))$ | - wc -w) + count=$($LFS getdirstripe -i $testdir/* | + grep ^$((i - 1))$ | wc -l) echo "$count directories created on MDT$((i - 1))" [ $count -eq 100 ] || error "subdirs are not evenly distributed" + + if [ $stripe_count -gt 1 ]; then + count=$($LFS getdirstripe $testdir/* | + grep -P "^\s+$((i - 1))\t" | wc -l) + echo "$count stripes created on MDT$((i - 1))" + # deviation should < 5% of average + [ $count -lt $((95 * stripe_count)) ] || + [ $count -gt $((105 * stripe_count)) ] && + error "stripes are not evenly distributed" + fi done - rm -rf $DIR/$tdir/* + $LCTL set_param lmv.*.qos_threshold_rr=$lmv_qos_threshold_rr > /dev/null + do_nodes $mdts $LCTL set_param \ + lod.*.mdt_qos_thresholdrr=$lod_qos_threshold_rr > /dev/null - $LCTL set_param lmv.*.qos_threshold_rr=$qos_threshold_rr - # Shorten statfs result age, so that it can be updated in time - $LCTL set_param lmv.*.qos_maxage=1 - sleep_maxage + echo + echo "Check for uneven MDTs: " local ffree local bavail @@ -20867,9 +20866,8 @@ test_413b() { # Check if we need to generate uneven MDTs local threshold=50 - local diff=$(((max - min ) * 100 / min)) + local diff=$(((max - min) * 100 / min)) local value="$(generate_string 1024)" - local i while [ $diff -lt $threshold ]; do # generate uneven MDTs, create till $threshold% diff @@ -20884,11 +20882,11 @@ test_413b() { error 
"mkdir $tdir-MDT$min_index failed" for i in $(seq $count); do $OPENFILE -f O_CREAT:O_LOV_DELAY_CREATE \ - $DIR/$tdir-MDT$min_index/f$i > /dev/null || - error "create f$i failed" + $DIR/$tdir-MDT$min_index/f$j_$i > /dev/null || + error "create f$j_$i failed" setfattr -n user.413b -v $value \ - $DIR/$tdir-MDT$min_index/f$i || - error "setfattr f$i failed" + $DIR/$tdir-MDT$min_index/f$j_$i || + error "setfattr f$j_$i failed" done ffree=($(lctl get_param -n mdc.*[mM][dD][cC]-*.filesfree)) @@ -20904,31 +20902,95 @@ test_413b() { echo "MDT blocks available: ${bavail[@]}" echo "weight diff=$diff%" - echo "mkdir with balanced space usage" - $LCTL set_param lmv.*.qos_prio_free=100 + echo + echo "Mkdir (stripe_count $stripe_count) with balanced space usage:" + + $LCTL set_param lmv.*.qos_prio_free=100 > /dev/null + do_nodes $mdts $LCTL set_param lod.*.mdt_qos_prio_free=100 > /dev/null + # decrease statfs age, so that it can be updated in time + $LCTL set_param lmv.*.qos_maxage=1 > /dev/null + do_nodes $mdts $LCTL set_param lod.*.mdt_qos_maxage=1 > /dev/null + + sleep 1 + + testdir=$DIR/$tdir-s$stripe_count/qos + for i in $(seq $((100 * MDSCOUNT))); do - mkdir $DIR/$tdir/subdir$i || error "mkdir subdir$i failed" + eval $mkdir_cmd $testdir/subdir$i || + error "$mkdir_cmd subdir$i failed" done for i in $(seq $MDSCOUNT); do - count=$($LFS getdirstripe -i $DIR/$tdir/* | grep ^$((i - 1))$ | - wc -w) + count=$($LFS getdirstripe -i $testdir/* | grep ^$((i - 1))$ | + wc -l) echo "$count directories created on MDT$((i - 1))" + + if [ $stripe_count -gt 1 ]; then + count=$($LFS getdirstripe $testdir/* | + grep -P "^\s+$((i - 1))\t" | wc -l) + echo "$count stripes created on MDT$((i - 1))" + fi done - max=$($LFS getdirstripe -i $DIR/$tdir/* | grep ^$max_index$ | wc -l) - min=$($LFS getdirstripe -i $DIR/$tdir/* | grep ^$min_index$ | wc -l) + max=$($LFS getdirstripe -i $testdir/* | grep ^$max_index$ | wc -l) + min=$($LFS getdirstripe -i $testdir/* | grep ^$min_index$ | wc -l) + # D-value 
should be > 10% of average [ $((max - min)) -lt 10 ] && error "subdirs shouldn't be evenly distributed" - which getfattr > /dev/null 2>&1 || skip_env "no getfattr command" + # ditto + if [ $stripe_count -gt 1 ]; then + max=$($LFS getdirstripe $testdir/* | + grep -P "^\s+$max_index\t" | wc -l) + min=$($LFS getdirstripe $testdir/* | + grep -P "^\s+$min_index\t" | wc -l) + [ $((max - min)) -le $((10 * stripe_count)) ] && + error "stripes shouldn't be evenly distributed"|| true + fi +} + +test_413a() { + [ $MDSCOUNT -lt 2 ] && + skip "We need at least 2 MDTs for this test" - $LFS setdirstripe -D -d $DIR/$tdir || error "setdirstripe -d failed" - getfattr -n trusted.dmv $DIR/$tdir && - error "default dir layout exists" || true + [ $MDS1_VERSION -lt $(version_code 2.12.52) ] && + skip "Need server version at least 2.12.52" + + local stripe_count + + for stripe_count in $(seq 1 $((MDSCOUNT - 1))); do + mkdir $DIR/$tdir-s$stripe_count || error "mkdir failed" + mkdir $DIR/$tdir-s$stripe_count/rr || error "mkdir failed" + mkdir $DIR/$tdir-s$stripe_count/qos || error "mkdir failed" + test_qos_mkdir "$LFS mkdir -c $stripe_count" $stripe_count + done +} +run_test 413a "QoS mkdir with 'lfs mkdir -i -1'" + +test_413b() { + [ $MDSCOUNT -lt 2 ] && + skip "We need at least 2 MDTs for this test" + + [ $MDS1_VERSION -lt $(version_code 2.12.52) ] && + skip "Need server version at least 2.12.52" + + local stripe_count + + for stripe_count in $(seq 1 $((MDSCOUNT - 1))); do + mkdir $DIR/$tdir-s$stripe_count || error "mkdir failed" + mkdir $DIR/$tdir-s$stripe_count/rr || error "mkdir failed" + mkdir $DIR/$tdir-s$stripe_count/qos || error "mkdir failed" + $LFS setdirstripe -D -c $stripe_count \ + $DIR/$tdir-s$stripe_count/rr || + error "setdirstripe failed" + $LFS setdirstripe -D -c $stripe_count \ + $DIR/$tdir-s$stripe_count/qos || + error "setdirstripe failed" + test_qos_mkdir "mkdir" $stripe_count + done } -run_test 413b "mkdir with balanced space usage" +run_test 413b "QoS mkdir under dir 
whose default LMV starting MDT offset is -1" test_414() { #define OBD_FAIL_PTLRPC_BULK_ATTACH 0x521 diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c index 6364a1c..079d2a0 100644 --- a/lustre/utils/lfs.c +++ b/lustre/utils/lfs.c @@ -710,9 +710,6 @@ static int check_hashtype(const char *hashtype) if (strcmp(hashtype, mdt_hash_name[i]) == 0) return i; - if (!strcmp(hashtype, LMV_HASH_NAME_SPACE)) - return LMV_HASH_TYPE_DEFAULT | LMV_HASH_FLAG_SPACE; - return 0; } @@ -5592,28 +5589,6 @@ static int mntdf(char *mntdir, char *fsname, char *pool, enum mntdf_flags flags, return rc; } -static int ll_statfs_data_comp(const void *sd1, const void *sd2) -{ - const struct obd_statfs *st1 = &((const struct ll_statfs_data *)sd1)-> - sd_st; - const struct obd_statfs *st2 = &((const struct ll_statfs_data *)sd2)-> - sd_st; - int r1 = obd_statfs_ratio(st1, false); - int r2 = obd_statfs_ratio(st2, false); - int64_t result = r1 - r2; - - /* if both space usage are above 90, compare free inodes */ - if (r1 > 90 && r2 > 90) - result = st2->os_ffree - st1->os_ffree; - - if (result < 0) - return -1; - else if (result == 0) - return 0; - else - return 1; -} - /* functions */ static int lfs_setdirstripe(int argc, char **argv) { @@ -5626,12 +5601,9 @@ static int lfs_setdirstripe(int argc, char **argv) char *mode_opt = NULL; bool default_stripe = false; bool delete = false; - bool auto_distributed = false; bool foreign_mode = false; mode_t mode = S_IRWXU | S_IRWXG | S_IRWXO; mode_t previous_mode = 0; - struct ll_statfs_buf *lsb = NULL; - char mntdir[PATH_MAX] = ""; char *xattr = NULL; __u32 type = LU_FOREIGN_TYPE_DAOS, flags = 0; struct option long_opts[] = { @@ -5892,21 +5864,6 @@ static int lfs_setdirstripe(int argc, char **argv) memcpy(param->lsp_tgts, mdts, sizeof(*mdts) * lsa.lsa_nr_tgts); } - if (!default_stripe && (lsa.lsa_pattern & LMV_HASH_FLAG_SPACE)) { - fprintf(stderr, "%s %s: can only specify -H space with -D\n", - progname, argv[0]); - free(param); - return CMD_HELP; - } - - 
if (param->lsp_stripe_offset != -1 && - lsa.lsa_pattern & LMV_HASH_FLAG_SPACE) { - fprintf(stderr, "%s %s: can only specify -H space with -i -1\n", - progname, argv[0]); - free(param); - return CMD_HELP; - } - dname = argv[optind]; do { if (default_stripe) { @@ -5918,100 +5875,6 @@ static int lfs_setdirstripe(int argc, char **argv) continue; } - /* - * if current \a dname isn't under the same \a mntdir as the - * last one, and the last one was auto-distributed, restore - * \a param. - */ - if (mntdir[0] != '\0' && - strncmp(dname, mntdir, strlen(mntdir)) && - auto_distributed) { - param->lsp_is_specific = false; - param->lsp_stripe_offset = -1; - auto_distributed = false; - } - - /* - * TODO: when MDT can allocate object with QoS (LU-9435), below - * code should be removed, instead we should let LMV to allocate - * the starting MDT object, and then let LOD allocate other MDT - * objects. - */ - if (!param->lsp_is_specific && param->lsp_stripe_offset == -1) { - char path[PATH_MAX] = ""; - - if (!lsb) { - lsb = malloc(sizeof(*lsb)); - if (!lsb) { - result = -ENOMEM; - break; - } - } - lsb->sb_count = 0; - - /* use mntdir for dirname() temporarily */ - strncpy(mntdir, dname, sizeof(mntdir) - 1); - if (!realpath(dirname(mntdir), path)) { - result = -errno; - fprintf(stderr, - "error: invalid path '%s': %s\n", - argv[optind], strerror(errno)); - break; - } - mntdir[0] = '\0'; - - result = llapi_search_mounts(path, 0, mntdir, NULL); - if (result < 0 || mntdir[0] == '\0') { - fprintf(stderr, - "No suitable Lustre mount found\n"); - break; - } - - result = mntdf(mntdir, NULL, NULL, 0, LL_STATFS_LMV, - lsb); - if (result < 0) - break; - - if (param->lsp_stripe_count > lsb->sb_count) { - fprintf(stderr, - "error: stripe count %d is too big\n", - param->lsp_stripe_count); - result = -ERANGE; - break; - } - - qsort(lsb->sb_buf, lsb->sb_count, - sizeof(struct ll_statfs_data), - ll_statfs_data_comp); - - auto_distributed = true; - } - - if (auto_distributed) { - int r; - int nr 
= MAX(param->lsp_stripe_count, - lsb->sb_count / 2); - - /* don't use server whose usage is above 90% */ - while (nr != param->lsp_stripe_count && - obd_statfs_ratio(&lsb->sb_buf[nr].sd_st, false) > - 90) - nr = MAX(param->lsp_stripe_count, nr / 2); - - /* get \a r between [0, nr) */ - r = rand() % nr; - - param->lsp_stripe_offset = lsb->sb_buf[r].sd_index; - if (param->lsp_stripe_count > 1) { - int i = 0; - - param->lsp_is_specific = true; - for (; i < param->lsp_stripe_count; i++) - param->lsp_tgts[(i + r) % nr] = - lsb->sb_buf[i].sd_index; - } - } - result = llapi_dir_create(dname, mode, param); if (result) fprintf(stderr, @@ -6022,7 +5885,6 @@ static int lfs_setdirstripe(int argc, char **argv) if (mode_opt != NULL) umask(previous_mode); - free(lsb); free(param); return result; } @@ -6095,7 +5957,7 @@ static int lfs_mv(int argc, char **argv) } } - if (lmu.lum_stripe_offset == -1) { + if (lmu.lum_stripe_offset == LMV_OFFSET_DEFAULT) { fprintf(stderr, "%s mv: MDT index must be specified\n", progname); return CMD_HELP; diff --git a/lustre/utils/liblustreapi.c b/lustre/utils/liblustreapi.c index 848a72d..754536f 100644 --- a/lustre/utils/liblustreapi.c +++ b/lustre/utils/liblustreapi.c @@ -3109,8 +3109,6 @@ void lmv_dump_user_lmm(struct lmv_user_md *lum, char *pool_name, else llapi_printf(LLAPI_MSG_NORMAL, "%#x", type); - if (flags & LMV_HASH_FLAG_SPACE) - llapi_printf(LLAPI_MSG_NORMAL, ",space"); if (flags & LMV_HASH_FLAG_MIGRATION) llapi_printf(LLAPI_MSG_NORMAL, ",migrating"); if (flags & LMV_HASH_FLAG_BAD_TYPE) @@ -5209,7 +5207,7 @@ static int cb_getstripe(char *path, DIR *parent, DIR **dirp, void *data, lum->lum_magic = LMV_USER_MAGIC; lum->lum_stripe_count = 0; - lum->lum_stripe_offset = -1; + lum->lum_stripe_offset = LMV_OFFSET_DEFAULT; goto dump; } else if (param->fp_get_lmv) { struct lmv_user_md *lum = param->fp_lmv_md; diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index 74e4a31..a4d6a4b 100644 --- a/lustre/utils/wirecheck.c +++ 
b/lustre/utils/wirecheck.c @@ -851,7 +851,6 @@ check_lmv_mds_md_v1(void) CHECK_CDEFINE(LMV_MAGIC_V1); CHECK_CDEFINE(LMV_MAGIC_STRIPE); CHECK_CDEFINE(LMV_HASH_TYPE_MASK); - CHECK_CDEFINE(LMV_HASH_FLAG_SPACE); CHECK_CDEFINE(LMV_HASH_FLAG_LOST_LMV); CHECK_CDEFINE(LMV_HASH_FLAG_BAD_TYPE); CHECK_CDEFINE(LMV_HASH_FLAG_MIGRATION); diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c index 2f7dc2f..68b6691 100644 --- a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c @@ -1882,7 +1882,6 @@ void lustre_assert_wire_constants(void) CLASSERT(LMV_MAGIC_V1 == 0x0CD20CD0); CLASSERT(LMV_MAGIC_STRIPE == 0x0CD40CD0); CLASSERT(LMV_HASH_TYPE_MASK == 0x0000ffff); - CLASSERT(LMV_HASH_FLAG_SPACE == 0x08000000); CLASSERT(LMV_HASH_FLAG_LOST_LMV == 0x10000000); CLASSERT(LMV_HASH_FLAG_BAD_TYPE == 0x20000000); CLASSERT(LMV_HASH_FLAG_MIGRATION == 0x80000000); -- 1.8.3.1