From 81ac7c0c989dd862e2215a4635c77e5123289658 Mon Sep 17 00:00:00 2001 From: Patrick Farrell Date: Thu, 19 Jan 2023 15:05:38 -0500 Subject: [PATCH] LU-12273 lod: metadata overstriping This adds overstriping for MDTs, similar to overstriping for OSTs (added in LU-9846). This adds a new option to setdirstripe, -C, allowing creation of more than one stripe per MDT. It is also possible to place multiple stripes on the same MDT using specific striping with -m. This allows a single directory to more fully use the full capability of each MDT in the file system. Two limitations of note: 1. This requires > 1 MDT, otherwise the DNE subsystem is not initialized. 2. Due to recovery limitations, we allow a max of only 5 stripes per MDT. MDT overstriping increases mdtest-hard-write performance by up to 13%, mdtest-hard-stat by 93%, at the cost of a slight drop in mdtest-hard-read (7%), with no change in delete. 4 MDTs, 1 stripe/MDT: mdtest-hard-write 117.399467 kIOPS : time 339.496 seconds mdtest-hard-stat 727.020749 kIOPS : time 55.666 seconds mdtest-hard-read 245.556392 kIOPS : time 162.897 seconds mdtest-hard-delete 104.379111 kIOPS : time 382.710 seconds 4 MDTs, 4 stripes/MDTs: mdtest-hard-write 132.963290 kIOPS : time 309.093 seconds mdtest-hard-stat 1408.161148 kIOPS : time 30.107 seconds mdtest-hard-read 229.383910 kIOPS : time 179.576 seconds mdtest-hard-delete 103.284369 kIOPS : time 398.442 seconds Test-Parameters: testlist=sanity env=ONLY=300u serverversion=2.14.0 Signed-off-by: Patrick Farrell Signed-off-by: Qian Yingjin Signed-off-by: Lai Siyao Change-Id: I11556b223029820bd335e87c7bf073970e03468d Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/35034 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- lustre/doc/lfs-getdirstripe.1 | 3 +- lustre/doc/lfs-setdirstripe.1 | 8 +- lustre/include/lustre_net.h | 12 +- lustre/include/uapi/linux/lustre/lustre_user.h | 7 +- lustre/lod/lod_object.c | 80 +++++- 
lustre/lod/lod_qos.c | 134 +++++++--- lustre/mdt/mdt_handler.c | 2 +- lustre/mdt/mdt_recovery.c | 5 +- lustre/mdt/mdt_xattr.c | 7 +- lustre/ptlrpc/layout.c | 1 - lustre/ptlrpc/service.c | 7 +- lustre/tests/sanity.sh | 349 ++++++++++++++++++++++++- lustre/tests/test-framework.sh | 17 +- lustre/utils/lfs.c | 65 +++-- lustre/utils/liblustreapi.c | 5 +- 15 files changed, 602 insertions(+), 100 deletions(-) diff --git a/lustre/doc/lfs-getdirstripe.1 b/lustre/doc/lfs-getdirstripe.1 index ad86f6e..7f98a88 100644 --- a/lustre/doc/lfs-getdirstripe.1 +++ b/lustre/doc/lfs-getdirstripe.1 @@ -21,7 +21,8 @@ Show the default layout used when creating new subdirectories. Print usage message. .TP .BR \-H ", " \-\-mdt-hash -Only show the hash function being used for this directory. +Only show the hash function being used for this directory. Also shows hash +flags, such as overstriping (>1 stripe per MDT). .TP .BR --hex-idx Print MDT indexes in hexademical rather than decimal. diff --git a/lustre/doc/lfs-setdirstripe.1 b/lustre/doc/lfs-setdirstripe.1 index be8c653..80b766d 100644 --- a/lustre/doc/lfs-setdirstripe.1 +++ b/lustre/doc/lfs-setdirstripe.1 @@ -2,7 +2,7 @@ .SH NAME lfs setdirstripe, mkdir \- set striping pattern of a directory. .SH SYNOPSIS -.B lfs setdirstripe [\fR-cdDhHioTxX\fR] \fIDIR\fR... +.B lfs setdirstripe [\fR-cCdDhHioTxX\fR] \fIDIR\fR... .br .SH DESCRIPTION Create a striped directory with specified striping pattern. This @@ -25,6 +25,12 @@ Stripe the new directory over .I COUNT MDTs. .TP +.B -C\fR, \fB--overstripe-count \fISTRIPE_COUNT\fR +The number of stripes to create, creating > 1 stripe MDT if \fISTRIPE_COUNT\fR +exceeds the number of MDTs in the file system. \fB0 \fRmeans to use the +filesystem-wide default stripe count (default 1), and \fB-1 \fRmeans to stripe +over all available MDTs. Max of 5 stripes/MDT. +.TP .BR \-h ", " \-\-help Print usage message. 
.TP diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index bda0f5f..279a41b 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -635,8 +635,12 @@ struct ptlrpc_cb_id { void *cbid_arg; /* additional arg */ }; -/** Maximum number of locks to fit into reply state */ -#define RS_MAX_LOCKS 8 +/** Maximum number of locks to fit into reply state, migrating directory max + * stripe count is 2 * LMV_MAX_STRIPES_PER_MDT, plus source parent, target + * parent, source and target master object: + * 2 * LMV_MAX_STRIPES_PER_MDT + 4 + */ +#define RS_MAX_LOCKS 14 #define RS_DEBUG 0 /** @@ -705,8 +709,6 @@ struct ptlrpc_reply_state { /** Handles of locks awaiting client reply ACK */ struct lustre_handle rs_locks[RS_MAX_LOCKS]; - /** Lock modes of locks in \a rs_locks */ - enum ldlm_mode rs_modes[RS_MAX_LOCKS]; }; struct ptlrpc_thread; @@ -2229,7 +2231,7 @@ struct ptlrpc_service_conf { * @{ */ void ptlrpc_save_lock(struct ptlrpc_request *req, struct lustre_handle *lock, - int mode, bool no_ack); + bool no_ack); void ptlrpc_commit_replies(struct obd_export *exp); void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs); void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs); diff --git a/lustre/include/uapi/linux/lustre/lustre_user.h b/lustre/include/uapi/linux/lustre/lustre_user.h index 29c824e..ace28bb 100644 --- a/lustre/include/uapi/linux/lustre/lustre_user.h +++ b/lustre/include/uapi/linux/lustre/lustre_user.h @@ -1145,7 +1145,7 @@ static inline bool lmv_is_known_hash_type(__u32 type) #define LMV_HASH_FLAG_LAYOUT_CHANGE \ (LMV_HASH_FLAG_MIGRATION | LMV_HASH_FLAG_SPLIT | LMV_HASH_FLAG_MERGE) -#define LMV_HASH_FLAG_KNOWN 0xbe000000 +#define LMV_HASH_FLAG_KNOWN 0xbf000000 /* migration failure may leave hash type as * LMV_HASH_TYPE_UNKNOWN|LMV_HASH_FLAG_BAD_TYPE, which should be treated as @@ -1218,6 +1218,7 @@ extern struct lustre_foreign_type lu_foreign_types[]; * (max buffer size - lmv+rpc header) / 
sizeof(struct lmv_user_mds_data) */ #define LMV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ +#define LMV_MAX_STRIPES_PER_MDT 5 /* (RS_MAX_LOCKS - 4) / 2 */ #define lmv_user_md lmv_user_md_v1 struct lmv_user_md_v1 { __u32 lum_magic; /* must be the first field */ @@ -1267,7 +1268,9 @@ enum { LMV_INHERIT_DEFAULT_PLAIN = LMV_INHERIT_UNLIMITED, /* not inherit any more */ LMV_INHERIT_END = 1, - /* for multiple stripes, the default lum_max_inherit is 3 */ + /* for overstriped dirs, the default limit is 1 level of inheritance */ + LMV_INHERIT_DEFAULT_OVERSTRIPED = 2, + /* for multiple stripes, the default limit is 2 levels of inheritance*/ LMV_INHERIT_DEFAULT_STRIPED = 3, /* max inherit depth */ LMV_INHERIT_MAX = 250, diff --git a/lustre/lod/lod_object.c b/lustre/lod/lod_object.c index cd07bed..5615bbb 100644 --- a/lustre/lod/lod_object.c +++ b/lustre/lod/lod_object.c @@ -2047,11 +2047,17 @@ static int lod_mdt_alloc_specific(const struct lu_env *env, bool already_allocated = false; __u32 k; - CDEBUG(D_INFO, "try idx %d, mdt cnt %u, allocated %u\n", - idx, lod->lod_remote_mdt_count + 1, stripe_idx); + CDEBUG(D_INFO, + "try idx %d, mdt cnt %u, allocated %u, specific %d count %hu offset %d hash %#X\n", + idx, lod->lod_remote_mdt_count + 1, stripe_idx, + is_specific, lo->ldo_dir_stripe_count, + (int)lo->ldo_dir_stripe_offset, + lo->ldo_dir_hash_type); if (likely(!is_specific && - !CFS_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE))) { + !CFS_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE) && + !(lo->ldo_dir_hash_type & + LMV_HASH_FLAG_OVERSTRIPED))) { /* check whether the idx already exists * in current allocated array */ for (k = 0; k < stripe_idx; k++) { @@ -2167,6 +2173,7 @@ static int lod_prep_md_striped_create(const struct lu_env *env, struct dt_object **stripes; struct lu_object_conf conf = { .loc_flags = LOC_F_NEW }; struct lu_fid fid = { 0 }; + int mdt_count = lod->lod_remote_mdt_count + 1; __u32 stripe_count; int i; int rc = 0; @@ -2178,6 +2185,17 @@ static int 
lod_prep_md_striped_create(const struct lu_env *env, le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC); stripe_count = lo->ldo_dir_stripe_count; + if (!(lo->ldo_dir_hash_type & LMV_HASH_FLAG_OVERSTRIPED) && + stripe_count > mdt_count) + RETURN(-E2BIG); + + if ((lo->ldo_dir_hash_type & LMV_HASH_FLAG_OVERSTRIPED) && + (stripe_count > mdt_count * LMV_MAX_STRIPES_PER_MDT || + /* a single MDT doesn't initialize the infrastructure for striped + * directories, so we just don't support overstriping in that case + */ + mdt_count == 1)) + RETURN(-E2BIG); OBD_ALLOC_PTR_ARRAY(stripes, stripe_count); if (!stripes) @@ -2208,7 +2226,23 @@ static int lod_prep_md_striped_create(const struct lu_env *env, GOTO(out, rc = -ENOMEM); if (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) { + int stripes_per_mdt; + int mdt; + is_specific = true; + + /* Verify we do not exceed the stripes per MDT limit */ + for (mdt = 0; mdt < mdt_count + 1; mdt++) { + stripes_per_mdt = 0; + for (i = 0; i < stripe_count; i++) { + if (mdt == le32_to_cpu( + lum->lum_objects[i].lum_mds)) + stripes_per_mdt++; + } + if (stripes_per_mdt > LMV_MAX_STRIPES_PER_MDT) + GOTO(out_free, rc = -EINVAL); + } + for (i = 0; i < stripe_count; i++) idx_array[i] = le32_to_cpu(lum->lum_objects[i].lum_mds); @@ -2219,6 +2253,7 @@ static int lod_prep_md_striped_create(const struct lu_env *env, lu_site2seq(lod2lu_dev(lod)->ld_site)->ss_node_id; rc = lod_mdt_alloc_specific(env, lo, stripes, idx_array, is_specific); +out_free: OBD_FREE_PTR_ARRAY(idx_array, stripe_count); } @@ -5787,6 +5822,7 @@ static void lod_ah_init(const struct lu_env *env, if (S_ISDIR(child_mode)) { const struct lmv_user_md_v1 *lum1 = ah->dah_eadata; + int max_stripe_count; /* other default values are 0 */ lc->ldo_dir_stripe_offset = LMV_OFFSET_DEFAULT; @@ -5895,10 +5931,15 @@ static void lod_ah_init(const struct lu_env *env, d->lod_max_mdt_stripecount) lc->ldo_dir_stripe_count = d->lod_max_mdt_stripecount; - /* shrink the stripe_count to the avaible 
MDT count */ - if (lc->ldo_dir_stripe_count > d->lod_remote_mdt_count + 1 && + max_stripe_count = d->lod_remote_mdt_count + 1; + if (lc->ldo_dir_hash_type & LMV_HASH_FLAG_OVERSTRIPED) + max_stripe_count = + max_stripe_count * LMV_MAX_STRIPES_PER_MDT; + + /* shrink the stripe_count to max stripe count */ + if (lc->ldo_dir_stripe_count > max_stripe_count && !CFS_FAIL_CHECK(OBD_FAIL_LARGE_STRIPE)) { - lc->ldo_dir_stripe_count = d->lod_remote_mdt_count + 1; + lc->ldo_dir_stripe_count = max_stripe_count; if (lc->ldo_dir_stripe_count == 1) lc->ldo_dir_stripe_count = 0; } @@ -5917,7 +5958,7 @@ static void lod_ah_init(const struct lu_env *env, lc->ldo_def_striping = lds; } - CDEBUG(D_INFO, "final dir stripe_count=%hu offset=%d hash=%u\n", + CDEBUG(D_INFO, "final dir stripe_count=%hu offset=%d hash=%x\n", lc->ldo_dir_stripe_count, (int)lc->ldo_dir_stripe_offset, lc->ldo_dir_hash_type); @@ -8970,6 +9011,7 @@ static int lod_dir_declare_layout_split(const struct lu_env *env, struct dt_object_format *dof = &info->lti_format; struct lmv_user_md_v1 *lum = mlc->mlc_spec->u.sp_ea.eadata; struct dt_object **stripes; + int mdt_count = lod->lod_remote_mdt_count + 1; u32 stripe_count; u32 saved_count; int i; @@ -8985,6 +9027,29 @@ static int lod_dir_declare_layout_split(const struct lu_env *env, if (stripe_count <= saved_count) RETURN(-EINVAL); + /* if the split target is overstriped, we need to put that flag in the + * current layout so it can allocate the larger number of stripes + * + * Note we need to pick up any hash *flags* which affect allocation + * *before* allocation, so they're used in allocating the directory, + * rather than after when we finalize directory setup (at the end of + * this function). 
+ */ + if (le32_to_cpu(lum->lum_hash_type) & LMV_HASH_FLAG_OVERSTRIPED) + lo->ldo_dir_hash_type |= LMV_HASH_FLAG_OVERSTRIPED; + + if (!(lo->ldo_dir_hash_type & LMV_HASH_FLAG_OVERSTRIPED) && + stripe_count > mdt_count) { + RETURN(-E2BIG); + } else if ((lo->ldo_dir_hash_type & LMV_HASH_FLAG_OVERSTRIPED) && + (stripe_count > mdt_count * LMV_MAX_STRIPES_PER_MDT || + /* a single MDT doesn't initialize the infrastructure for striped + * directories, so we just don't support overstriping in that case + */ + mdt_count == 1)) { + RETURN(-E2BIG); + } + dof->dof_type = DFT_DIR; OBD_ALLOC(stripes, sizeof(*stripes) * stripe_count); @@ -8995,6 +9060,7 @@ static int lod_dir_declare_layout_split(const struct lu_env *env, stripes[i] = lo->ldo_stripe[i]; lod_qos_statfs_update(env, lod, &lod->lod_mdt_descs); + rc = lod_mdt_alloc_qos(env, lo, stripes, saved_count, stripe_count); if (rc == -EAGAIN) rc = lod_mdt_alloc_rr(env, lo, stripes, saved_count, diff --git a/lustre/lod/lod_qos.c b/lustre/lod/lod_qos.c index 8887d5f..c93cee8 100644 --- a/lustre/lod/lod_qos.c +++ b/lustre/lod/lod_qos.c @@ -920,15 +920,16 @@ int lod_mdt_alloc_rr(const struct lu_env *env, struct lod_object *lo, struct lu_tgt_descs *ltd = &lod->lod_mdt_descs; struct lu_tgt_pool *pool; struct lu_qos_rr *lqr; - struct lu_tgt_desc *mdt; struct lu_object_conf conf = { .loc_flags = LOC_F_NEW }; struct lu_fid fid = { 0 }; struct dt_object *dto; unsigned int pool_idx; unsigned int i; u32 saved_idx = stripe_idx; + int stripes_per_mdt = 1; u32 mdt_idx; bool use_degraded = false; + bool overstriped = false; int tgt_connecting = 0; int rc; @@ -940,6 +941,14 @@ int lod_mdt_alloc_rr(const struct lu_env *env, struct lod_object *lo, if (rc) RETURN(rc); + overstriped = lo->ldo_dir_hash_type & LMV_HASH_FLAG_OVERSTRIPED; + + if (stripe_count > lod->lod_remote_mdt_count + 1 && !overstriped) + RETURN(-E2BIG); + + if (lo->ldo_dir_hash_type & LMV_HASH_FLAG_OVERSTRIPED) + stripes_per_mdt = stripe_count / (pool->op_count + 1); + rc = 
lod_qos_mdt_in_use_init(env, ltd, stripe_idx, stripe_count, pool, stripes); if (rc) @@ -956,7 +965,8 @@ int lod_mdt_alloc_rr(const struct lu_env *env, struct lod_object *lo, } else if (atomic_read(&lqr->lqr_start_idx) >= pool->op_count) { /* If we have allocated from all of the tgts, slowly * precess the next start if the tgt/stripe count isn't - * already doing this for us. */ + * already doing this for us. + */ atomic_sub(pool->op_count, &lqr->lqr_start_idx); if (stripe_count - 1 > 1 && (pool->op_count % (stripe_count - 1)) != 1) @@ -965,65 +975,111 @@ int lod_mdt_alloc_rr(const struct lu_env *env, struct lod_object *lo, spin_unlock(&lqr->lqr_alloc); repeat_find: - CDEBUG(D_OTHER, "want=%d start_idx=%d start_count=%d offset=%d active=%d count=%d\n", + CDEBUG(D_OTHER, + "want=%d start_idx=%d start_count=%d offset=%d active=%d count=%d\n", stripe_count - 1, atomic_read(&lqr->lqr_start_idx), - lqr->lqr_start_count, lqr->lqr_offset_idx, pool->op_count, - pool->op_count); - - for (i = 0; i < pool->op_count && stripe_idx < stripe_count; i++) { + lqr->lqr_start_count, lqr->lqr_offset_idx, + /* if we're overstriped, the local MDT is available and is + * included in the count + */ + pool->op_count + overstriped, + lqr->lqr_pool.op_count + overstriped); + + for (i = 0; i < (pool->op_count + overstriped) * stripes_per_mdt && + stripe_idx < stripe_count; i++) { + struct lu_tgt_desc *mdt = NULL; + struct dt_device *mdt_tgt; + bool local_alloc = false; int idx; idx = atomic_inc_return(&lqr->lqr_start_idx); pool_idx = (idx + lqr->lqr_offset_idx) % - pool->op_count; - mdt_idx = lqr->lqr_pool.op_array[pool_idx]; - mdt = LTD_TGT(ltd, mdt_idx); + (pool->op_count + overstriped); + /* in the overstriped case, we must be able to allocate a stripe + * to the local MDT, ie, the one doing the allocation + */ + if (pool_idx == pool->op_count) { + LASSERT(overstriped); + /* because there is already a stripe on the local MDT, + * do not allocate from the local MDT until we've + * allocated 
at least as many stripes as we have MDTs + */ + if (stripe_idx < (pool->op_count + 1)) { + CDEBUG(D_OTHER, + "Skipping local alloc, not enough stripes yet\n"); + continue; + } + CDEBUG(D_OTHER, "Attempting to allocate locally\n"); + local_alloc = true; + mdt_tgt = lod->lod_child; + rc = lodname2mdt_index(lod2obd(lod)->obd_name, + &mdt_idx); + /* this parsing can't fail here because we're working + * with a known-good MDT + */ + LASSERT(!rc); + } else { + mdt_idx = lqr->lqr_pool.op_array[pool_idx]; + mdt = LTD_TGT(ltd, mdt_idx); + mdt_tgt = mdt->ltd_tgt; + } CDEBUG(D_OTHER, "#%d strt %d act %d strp %d ary %d idx %d\n", i, idx, /* XXX: active*/ 0, stripe_idx, pool_idx, mdt_idx); - if (mdt_idx == LOV_QOS_EMPTY || - !test_bit(mdt_idx, ltd->ltd_tgt_bitmap)) - continue; - - /* do not put >1 objects on one MDT */ - if (lod_qos_is_tgt_used(env, mdt_idx, stripe_idx)) - continue; - - if (mdt->ltd_discon) { - tgt_connecting = 1; + if (!local_alloc && (mdt_idx == LOV_QOS_EMPTY || + !test_bit(mdt_idx, ltd->ltd_tgt_bitmap))) { + CDEBUG(D_OTHER, "mdt_idx not found %d\n", mdt_idx); continue; } - if (lod_statfs_check(ltd, mdt)) - continue; + /* do not put >1 objects on one MDT, except for overstriping */ + if (!local_alloc) { + if (lo->ldo_dir_hash_type & LMV_HASH_FLAG_OVERSTRIPED) { + CDEBUG(D_OTHER, "overstriped\n"); + } else if (lod_qos_is_tgt_used(env, mdt_idx, + stripe_idx)) { + CDEBUG(D_OTHER, "#%d: already used\n", mdt_idx); + continue; + } + } - if (mdt->ltd_statfs.os_state & OS_STATFS_NOCREATE) - continue; + /* we know the local MDT is usable */ + if (!local_alloc) { + if (mdt->ltd_discon) { + tgt_connecting = 1; + CDEBUG(D_OTHER, "#%d: unusable\n", mdt_idx); + continue; + } + if (lod_statfs_check(ltd, mdt)) + continue; + if (mdt->ltd_statfs.os_state & OS_STATFS_NOCREATE) + continue; + } /* try to use another OSP if this one is degraded */ - if (mdt->ltd_statfs.os_state & OS_STATFS_DEGRADED && - !use_degraded) { + if (!local_alloc && !use_degraded && + 
mdt->ltd_statfs.os_state & OS_STATFS_DEGRADED) { CDEBUG(D_OTHER, "#%d: degraded\n", mdt_idx); continue; } - rc = dt_fid_alloc(env, mdt->ltd_tgt, &fid, NULL, NULL); + rc = dt_fid_alloc(env, mdt_tgt, &fid, NULL, NULL); if (rc < 0) { CDEBUG(D_OTHER, "#%d: alloc FID failed: %dl\n", mdt_idx, rc); continue; } - dto = dt_locate_at(env, mdt->ltd_tgt, &fid, + dto = dt_locate_at(env, mdt_tgt, &fid, lo->ldo_obj.do_lu.lo_dev->ld_site->ls_top_dev, &conf); if (IS_ERR(dto)) { CDEBUG(D_OTHER, "can't alloc stripe on #%u: %d\n", - mdt->ltd_index, (int) PTR_ERR(dto)); + mdt_idx, (int) PTR_ERR(dto)); - if (mdt->ltd_discon) + if (!local_alloc && mdt->ltd_discon) tgt_connecting = 1; continue; } @@ -1041,9 +1097,15 @@ repeat_find: } up_read(&ltd->ltd_qos.lq_rw_sem); - if (stripe_idx > saved_idx) + if (stripe_idx > saved_idx) { + /* If there are enough MDTs, we will not actually do + * overstriping, and the hash flags should reflect this. + */ + if (!overstriped) + lo->ldo_dir_hash_type &= ~LMV_HASH_FLAG_OVERSTRIPED; /* at least one stripe is allocated */ RETURN(stripe_idx); + } /* nobody provided us with a single object */ if (tgt_connecting) @@ -1821,6 +1883,16 @@ int lod_mdt_alloc_qos(const struct lu_env *env, struct lod_object *lo, if (stripe_idx == stripe_count) RETURN(stripe_count); + /* we do not use qos for overstriping, since it will always use all the + * MDTs. So we check if it's truly needed, falling back to rr if it is, + * and otherwise we remove the flag and continue + */ + if (lo->ldo_dir_hash_type & LMV_HASH_FLAG_OVERSTRIPED) { + if (stripe_count > lod->lod_remote_mdt_count + 1) + RETURN(-EAGAIN); + lo->ldo_dir_hash_type &= ~LMV_HASH_FLAG_OVERSTRIPED; + } + /* use MDT pool in @ltd, once MDT pool is supported in the future, it * can be passed in as argument like OST object allocation. 
*/ diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index dbb4ed4..794ed24 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -4226,7 +4226,7 @@ static void mdt_save_lock(struct mdt_thread_info *info, struct lustre_handle *h, if (req->rq_export->exp_disconnected) mdt_fid_unlock(h, mode); else - ptlrpc_save_lock(req, h, mode, no_ack); + ptlrpc_save_lock(req, h, no_ack); } else { mdt_fid_unlock(h, mode); } diff --git a/lustre/mdt/mdt_recovery.c b/lustre/mdt/mdt_recovery.c index 5c955ca..f8be62f 100644 --- a/lustre/mdt/mdt_recovery.c +++ b/lustre/mdt/mdt_recovery.c @@ -102,8 +102,7 @@ static void mdt_steal_ack_locks(struct ptlrpc_request *req) spin_lock(&rs->rs_lock); for (i = 0; i < rs->rs_nlocks; i++) - ptlrpc_save_lock(req, &rs->rs_locks[i], - rs->rs_modes[i], rs->rs_no_ack); + ptlrpc_save_lock(req, &rs->rs_locks[i], rs->rs_no_ack); rs->rs_nlocks = 0; DEBUG_REQ(D_HA, req, "stole locks for"); @@ -120,7 +119,7 @@ static void mdt_steal_ack_locks(struct ptlrpc_request *req) rs = req->rq_reply_state; for (i = 0; i < rs->rs_nlocks; i++) - ldlm_lock_decref(&rs->rs_locks[i], rs->rs_modes[i]); + ldlm_lock_decref(&rs->rs_locks[i], LCK_TXN); rs->rs_nlocks = 0; } diff --git a/lustre/mdt/mdt_xattr.c b/lustre/mdt/mdt_xattr.c index 655c202..b83f1ce 100644 --- a/lustre/mdt/mdt_xattr.c +++ b/lustre/mdt/mdt_xattr.c @@ -421,10 +421,11 @@ int mdt_dir_layout_update(struct mdt_thread_info *info) * directory. 
*/ - if (lum_stripe_count > 1 && lmu->lum_hash_type && - lmu->lum_hash_type != + if (lum_stripe_count > 1 && + (lmu->lum_hash_type & cpu_to_le32(LMV_HASH_TYPE_MASK)) && + (lmu->lum_hash_type & cpu_to_le32(LMV_HASH_TYPE_MASK)) != (lmv->lmv_hash_type & cpu_to_le32(LMV_HASH_TYPE_MASK))) { - CERROR("%s: "DFID" migrate mdt hash mismatch %u != %u\n", + CERROR("%s: "DFID" migrate mdt hash mismatch %x != %x\n", mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1), lmv->lmv_hash_type, lmu->lum_hash_type); GOTO(unlock_obj, rc = -EINVAL); diff --git a/lustre/ptlrpc/layout.c b/lustre/ptlrpc/layout.c index 9dcf442..696c9dc 100644 --- a/lustre/ptlrpc/layout.c +++ b/lustre/ptlrpc/layout.c @@ -2846,7 +2846,6 @@ int req_capsule_server_grow(struct req_capsule *pill, nrs->rs_no_ack = rs->rs_no_ack; for (i = 0; i < rs->rs_nlocks; i++) { nrs->rs_locks[i] = rs->rs_locks[i]; - nrs->rs_modes[i] = rs->rs_modes[i]; nrs->rs_nlocks++; } rs->rs_nlocks = 0; diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index f6b2739..9276b8d 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -179,17 +179,17 @@ static int ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post) * Puts a lock and its mode into reply state assotiated to request reply. 
*/ void ptlrpc_save_lock(struct ptlrpc_request *req, struct lustre_handle *lock, - int mode, bool no_ack) + bool no_ack) { struct ptlrpc_reply_state *rs = req->rq_reply_state; int idx; LASSERT(rs != NULL); + CDEBUG(D_RPCTRACE, "nlocks %d\n", rs->rs_nlocks); LASSERT(rs->rs_nlocks < RS_MAX_LOCKS); idx = rs->rs_nlocks++; rs->rs_locks[idx] = *lock; - rs->rs_modes[idx] = mode; rs->rs_difficult = 1; rs->rs_no_ack = no_ack; } @@ -2488,8 +2488,7 @@ static int ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) } while (nlocks-- > 0) - ldlm_lock_decref(&rs->rs_locks[nlocks], - rs->rs_modes[nlocks]); + ldlm_lock_decref(&rs->rs_locks[nlocks], LCK_TXN); spin_lock(&rs->rs_lock); } diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index a9b6ad4..f6ab132 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -25610,32 +25610,41 @@ cleanup_test_300() { trap 0 umask $SAVE_UMASK } + test_striped_dir() { local mdt_index=$1 - local stripe_count + local stripe_count=$2 + local overstriping=$3 local stripe_index + local getstripe_count mkdir -p $DIR/$tdir SAVE_UMASK=$(umask) trap cleanup_test_300 RETURN EXIT - $LFS setdirstripe -i $mdt_index -c 2 -H all_char -o 755 \ - $DIR/$tdir/striped_dir || - error "set striped dir error" + if [ -z $overstriping ]; then + $LFS setdirstripe -i $mdt_index -c $stripe_count -H all_char \ + -o 755 $DIR/$tdir/striped_dir || + error "set striped dir error" + else + $LFS setdirstripe -i $mdt_index -C $stripe_count -H all_char \ + -o 755 $DIR/$tdir/striped_dir || + error "set striped dir error" + fi local mode=$(stat -c%a $DIR/$tdir/striped_dir) [ "$mode" = "755" ] || error "expect 755 got $mode" $LFS getdirstripe $DIR/$tdir/striped_dir > /dev/null 2>&1 || error "getdirstripe failed" - stripe_count=$($LFS getdirstripe -c $DIR/$tdir/striped_dir) - if [ "$stripe_count" != "2" ]; then - error "1:stripe_count is $stripe_count, expect 2" + getstripe_count=$($LFS getdirstripe -c $DIR/$tdir/striped_dir) + if [ "$getstripe_count" != 
"$stripe_count" ]; then + error "1:stripe_count is $getstripe_count, expect $stripe_count" fi - stripe_count=$($LFS getdirstripe -T $DIR/$tdir/striped_dir) - if [ "$stripe_count" != "2" ]; then - error "2:stripe_count is $stripe_count, expect 2" + getstripe_count=$($LFS getdirstripe -T $DIR/$tdir/striped_dir) + if [ "$getstripe_count" != "$stripe_count" ]; then + error "2:stripe_count is $getstripe_count, expect $stripe_count" fi stripe_index=$($LFS getdirstripe -i $DIR/$tdir/striped_dir) @@ -25684,8 +25693,8 @@ test_300a() { [ $PARALLEL == "yes" ] && skip "skip parallel run" [ $MDSCOUNT -lt 2 ] && skip_env "needs >= 2 MDTs" - test_striped_dir 0 || error "failed on striped dir on MDT0" - test_striped_dir 1 || error "failed on striped dir on MDT0" + test_striped_dir 0 2 || error "failed on striped dir on MDT0" + test_striped_dir 1 2 || error "failed on striped dir on MDT0" } run_test 300a "basic striped dir sanity test" @@ -26433,6 +26442,322 @@ test_300t() { } run_test 300t "test max_mdt_stripecount" +MDT_OVSTRP_VER="2.15.60" +# 300u family tests MDT overstriping +test_300ua() { + (( MDSCOUNT > 1 )) || skip "needs >= 2 MDTs" + + local setcount=$((MDSCOUNT * 2)) + + local expected_count + + mkdir $DIR/$tdir + $LFS setdirstripe -C $setcount $DIR/$tdir/${tdir}.0 || + error "(0) failed basic overstriped dir creation test" + local getstripe_count=$($LFS getdirstripe -c $DIR/$tdir/${tdir}.0) + + # This does a basic interop test - if the MDS does not support mdt + # overstriping, we should get stripes == number of MDTs + if (( $MDS1_VERSION < $(version_code $MDT_OVSTRP_VER) )); then + expected_count=$MDSCOUNT + else + expected_count=$setcount + fi + (( getstripe_count == expected_count )) || + error "(1) incorrect stripe count for simple overstriped dir" + + rm -rf $DIR/$tdir/${tdir}.0 || + error "(2) unable to rm overstriped dir" + + # Tests after this require overstriping support + (( MDS1_VERSION >= $(version_code $MDT_OVSTRP_VER) )) || + { echo "skipped for MDS < 
$MDT_OVSTRP_VER"; return 0; } + + test_striped_dir 0 $setcount true || + error "(3)failed on overstriped dir" + test_striped_dir 1 $setcount true || + error "(4)failed on overstriped dir" + + local setcount=$((MDSCOUNT * $LMV_MAX_STRIPES_PER_MDT)) + + test_striped_dir 0 $setcount true || + error "(5)failed on overstriped dir" +} +run_test 300ua "basic overstriped dir sanity test" + +test_300ub() { + (( MDS1_VERSION >= $(version_code $MDT_OVSTRP_VER) )) || + skip "skipped for MDS < $MDT_OVSTRP_VER" + (( MDSCOUNT > 1 )) || skip "needs >= 2 MDTs" + + mkdir $DIR/$tdir + + echo "Testing invalid stripe count, failure expected" + local setcount=$((MDSCOUNT * 2)) + + $LFS setdirstripe -c $setcount $DIR/$tdir/${tdir}.0 + local getstripe_count=$($LFS getdirstripe -c $DIR/$tdir/${tdir}.0) + + (( getstripe_count <= MDSCOUNT )) || + error "(0)stripe count ($setcount) > MDT count ($MDSCOUNT) succeeded with -c" + + # When a user requests > LMV_MAX_STRIPES_PER_MDT, we reduce to that + setcount=$((MDSCOUNT * 2 * LMV_MAX_STRIPES_PER_MDT)) + $LFS setdirstripe -C $setcount $DIR/$tdir/${tdir}.1 + + local maxcount=$((MDSCOUNT * LMV_MAX_STRIPES_PER_MDT)) + + getstripe_count=$($LFS getdirstripe -c $DIR/$tdir/${tdir}.1) + (( getstripe_count == maxcount )) || + error "(1)stripe_count is $getstripe_count, expect $maxcount" + + # Test specific striping with -i + $LFS setdirstripe -i 0,0,0,0 $DIR/$tdir/${tdir}.2 + + getstripe_count=$($LFS getdirstripe -c $DIR/$tdir/${tdir}.2) + (( getstripe_count == 4 )) || + error "(2)stripe_count is $getstripe_count, expect 4" + + local nonzeroindices=$($LFS getdirstripe $DIR/$tdir/${tdir}.2 | grep "\[" | \ + grep -v mdtidx | awk '{print $1}' | grep -c -v 0) + + [[ -n "$nonzeroindices" ]] || + error "(3) stripes indices not all 0: $nonzeroindices" + + # Test specific striping with too many stripes on one MDT + echo "Testing invalid striping, failure expected" + $LFS setdirstripe -i 0,1,0,1,0,1,0,1,0,1,0 $DIR/$tdir/${tdir}.3 + $LFS getdirstripe 
$DIR/$tdir/${tdir}.3 + getstripe_count=$($LFS getdirstripe $DIR/$tdir/${tdir}.3 | grep "\[" | \ + grep -v mdtidx | awk '{print $1}' | grep -c '0') + echo "stripes on MDT0: $getstripe_count" + (( getstripe_count <= LMV_MAX_STRIPES_PER_MDT )) || + error "(4) setstripe with too many stripes on MDT0 succeeded" + + setcount=$((MDSCOUNT * 2)) + $LFS setdirstripe -C $setcount -H all_char $DIR/${tdir}.4 || + error "(5) can't setdirstripe with manually set hash function" + + getstripe_count=$($LFS getdirstripe -c $DIR/${tdir}.4) + (( getstripe_count == setcount )) || + error "(6)stripe_count is $getstripe_count, expect $setcount" + + setcount=$((MDSCOUNT * 2)) + mkdir $DIR/${tdir}.5 + $LFS setdirstripe -C $setcount -D -H crush $DIR/${tdir}.5 || + error "(7) can't setdirstripe with manually set hash function" + mkdir $DIR/${tdir}.5/${tdir}.6 + + getstripe_count=$($LFS getdirstripe -c $DIR/${tdir}.5/${tdir}.6) + (( getstripe_count == setcount )) || + error "(8)stripe_count is $getstripe_count, expect $setcount" +} +run_test 300ub "test MDT overstriping interface & limits" + +test_300uc() { + (( MDS1_VERSION >= $(version_code $MDT_OVSTRP_VER) )) || + skip "skipped for MDS < $MDT_OVSTRP_VER" + (( MDSCOUNT > 1 )) || skip "needs >= 2 MDTs" + + mkdir $DIR/$tdir + + local setcount=$((MDSCOUNT * 2)) + + $LFS setdirstripe -D -C $setcount $DIR/$tdir + + mkdir $DIR/$tdir/${tdir}.1 + + local getstripe_count=$($LFS getdirstripe -c $DIR/$tdir/${tdir}.1) + + (( getstripe_count == setcount )) || + error "(0)stripe_count is $getstripe_count, expect $setcount" + + mkdir $DIR/$tdir/${tdir}.1/${tdir}.2 + + local getstripe_count=$($LFS getdirstripe -c \ + $DIR/$tdir/${tdir}.1/${tdir}.2) + + (( getstripe_count == setcount )) || + error "(1)stripe_count is $getstripe_count, expect $setcount" +} +run_test 300uc "test MDT overstriping as default & inheritance" + +test_300ud() { + (( MDS1_VERSION >= $(version_code $MDT_OVSTRP_VER) )) || + skip "skipped for MDS < $MDT_OVSTRP_VER" + (( MDSCOUNT > 1 )) 
|| skip "needs >= 2 MDTs" + + local mdts=$(comma_list $(mdts_nodes)) + local timeout=100 + + local restripe_status + local delta + local i + + [[ $mds1_FSTYPE == zfs ]] && timeout=300 + + # in case "crush" hash type is not set + do_nodes $mdts "$LCTL set_param lod.*.mdt_hash=crush" + + restripe_status=$(do_facet mds1 $LCTL get_param -n \ + mdt.*MDT0000.enable_dir_restripe) + do_nodes $mdts "$LCTL set_param mdt.*.enable_dir_restripe=1" + stack_trap "do_nodes $mdts $LCTL set_param \ + mdt.*.enable_dir_restripe=$restripe_status" + + mkdir $DIR/$tdir + createmany -m $DIR/$tdir/f $((50 * MDSCOUNT)) || + error "create files under remote dir failed $i" + createmany -d $DIR/$tdir/d $((50 * MDSCOUNT)) || + error "create dirs under remote dir failed $i" + + local setcount=$((MDSCOUNT * $LMV_MAX_STRIPES_PER_MDT)) + + (( setcount < 13 )) || setcount=12 + for i in $(seq 2 $setcount); do + do_nodes $mdts "$LCTL set_param mdt.*.md_stats=clear >/dev/null" + $LFS setdirstripe -C $i $DIR/$tdir || + error "split -C $i $tdir failed" + wait_update $HOSTNAME \ + "$LFS getdirstripe -H $DIR/$tdir" "crush" $timeout || + error "dir split not finished" + delta=$(do_nodes $mdts "lctl get_param -n mdt.*MDT*.md_stats" | + awk '/migrate/ {sum += $2} END { print sum }') + echo "$delta migrated when dir split $((i - 1)) to $i stripes" + # delta is around total_files/stripe_count, deviation 3% + (( delta < 100 * MDSCOUNT / i + 3 * MDSCOUNT )) || + error "$delta files migrated >= $((100 * MDSCOUNT / i + 3 * MDSCOUNT))" + done +} +run_test 300ud "dir split" + +test_300ue() { + (( MDS1_VERSION >= $(version_code $MDT_OVSTRP_VER) )) || + skip "skipped for MDS < $MDT_OVSTRP_VER" + (( MDSCOUNT > 1 )) || skip "needs >= 2 MDTs" + + local mdts=$(comma_list $(mdts_nodes)) + local timeout=100 + + local restripe_status + local delta + local c + + [[ $mds1_FSTYPE == zfs ]] && timeout=300 + + do_nodes $mdts "$LCTL set_param lod.*.mdt_hash=crush" + + restripe_status=$(do_facet mds1 $LCTL get_param -n \ + 
mdt.*MDT0000.enable_dir_restripe) + do_nodes $mdts "$LCTL set_param mdt.*.enable_dir_restripe=1" + stack_trap "do_nodes $mdts $LCTL set_param \ + mdt.*.enable_dir_restripe=$restripe_status" + + local setcount=$((MDSCOUNT * $LMV_MAX_STRIPES_PER_MDT)) + + (( setcount < 13 )) || setcount=12 + test_mkdir -C $setcount -H crush $DIR/$tdir + createmany -m $DIR/$tdir/f $((50 * MDSCOUNT)) || + error "create files under remote dir failed" + createmany -d $DIR/$tdir/d $((50 * MDSCOUNT)) || + error "create dirs under remote dir failed" + + for c in $(seq $((setcount - 1)) -1 1); do + do_nodes $mdts "$LCTL set_param mdt.*.md_stats=clear >/dev/null" + $LFS setdirstripe -C $c $DIR/$tdir || + error "split -C $c $tdir failed" + wait_update $HOSTNAME \ + "$LFS getdirstripe -H $DIR/$tdir" "crush,fixed" $timeout || + error "dir merge not finished" + delta=$(do_nodes $mdts "lctl get_param -n mdt.*MDT*.md_stats" | + awk '/migrate/ {sum += $2} END { print sum }') + echo "$delta migrated when dir merge $((c + 1)) to $c stripes" + # delta is around total_files/stripe_count, deviation 3% + (( delta < 100 * MDSCOUNT / c + 3 * MDSCOUNT )) || + error "$delta files migrated >= $((100 * MDSCOUNT / c + 3 * MDSCOUNT))" + done +} +run_test 300ue "dir merge" + +test_300uf() { + (( MDS1_VERSION >= $(version_code $MDT_OVSTRP_VER) )) || + skip "skipped for MDS < $MDT_OVSTRP_VER" + (( MDSCOUNT > 1 )) || skip "needs >= 2 MDTs" + + # maximum amount of local locks: + # parent striped dir - 2 locks + # new stripe in parent to migrate to - 1 lock + # source and target - 2 locks + # Total 5 locks for regular file + # + # NB: Overstriping should add several extra local locks + # FIXME: Remove this once understood + #lctl set_param *debug=-1 debug_mb=10000 + lctl clear + lctl mark "touch/create" + mkdir -p $DIR/$tdir + local setcount=$((MDSCOUNT * $LMV_MAX_STRIPES_PER_MDT)) + local setcount=$((MDSCOUNT * 5)) + + $LFS mkdir -i1 -C $setcount $DIR/$tdir/dir1 + touch $DIR/$tdir/dir1/eee + + lctl mark "hardlinks" + 
# create 4 hardlink for 4 more locks + # Total: 9 locks > RS_MAX_LOCKS (8) + $LFS mkdir -i1 -c1 $DIR/$tdir/dir2 + $LFS mkdir -i1 -c1 $DIR/$tdir/dir3 + $LFS mkdir -i1 -c1 $DIR/$tdir/dir4 + $LFS mkdir -i1 -c1 $DIR/$tdir/dir5 + ln $DIR/$tdir/dir1/eee $DIR/$tdir/dir2/eee + ln $DIR/$tdir/dir1/eee $DIR/$tdir/dir3/eee + ln $DIR/$tdir/dir1/eee $DIR/$tdir/dir4/eee + ln $DIR/$tdir/dir1/eee $DIR/$tdir/dir5/eee + + lctl mark "cancel lru" + cancel_lru_locks mdc + + lctl mark "migrate" + $LFS migrate -m1 -c1 $DIR/$tdir/dir1 || + error "migrate dir fails" + + rm -rf $DIR/$tdir || error "rm dir failed after migration" +} +run_test 300uf "migrate with too many local locks" + +test_300ug() { + (( MDS1_VERSION >= $(version_code $MDT_OVSTRP_VER) )) || + skip "skipped for MDS < $MDT_OVSTRP_VER" + (( MDSCOUNT > 1 )) || skip "needs >= 2 MDTs" + + mkdir -p $DIR/$tdir + local migrate_dir=$DIR/$tdir/migrate_dir + local setcount=$((MDSCOUNT * $LMV_MAX_STRIPES_PER_MDT)) + local setcount2=$((setcount - 2)) + + $LFS setdirstripe -c 2 $migrate_dir || + error "(0) failed to create striped directory" + + $LFS migrate -m 0 -C $setcount $migrate_dir || + error "(1)failed to migrate to overstriped directory" + local getstripe_count=$($LFS getdirstripe -c $migrate_dir) + + (( getstripe_count == setcount )) || + error "(2)stripe_count is $getstripe_count, expect $setcount" + touch $DIR/$tdir/migrate_dir/$tfile || + error "(3)failed to create file in overstriped directory" + $LFS migrate -m 0 -C $setcount2 $migrate_dir || + error "(4)failed to migrate overstriped directory" + # Check stripe count after migration + $LFS getdirstripe $migrate_dir + getstripe_count=$($LFS getdirstripe -c $migrate_dir) + (( getstripe_count == setcount2 )) || + error "(5)stripe_count is $getstripe_count, expect $setcount2" + + rm -rf $migrate_dir || error "(6) unable to rm overstriped dir" +} +run_test 300ug "migrate overstriped dirs" + prepare_remote_file() { mkdir $DIR/$tdir/src_dir || error "create remote source failed" 
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 3d2b29c..b4b0830 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -649,6 +649,7 @@ init_test_env() { # Constants used in more than one test script export LOV_MAX_STRIPE_COUNT=2000 + export LMV_MAX_STRIPES_PER_MDT=5 export DELETE_OLD_POOLS=${DELETE_OLD_POOLS:-false} export KEEP_POOLS=${KEEP_POOLS:-false} export PARALLEL=${PARALLEL:-"no"} @@ -9735,13 +9736,16 @@ test_mkdir() { local dirstripe_count=${DIRSTRIPE_COUNT:-"2"} local dirstripe_index=${DIRSTRIPE_INDEX:-$((base % $MDSCOUNT))} local OPTIND=1 + local overstripe_count + local stripe_command="-c" (( $MDS1_VERSION > $(version_code 2.15.0) )) && hash_name+=("crush2") - while getopts "c:H:i:p" opt; do + while getopts "c:C:H:i:p" opt; do case $opt in c) dirstripe_count=$OPTARG;; + C) overstripe_count=$OPTARG;; H) hash_type=$OPTARG;; i) dirstripe_index=$OPTARG;; p) p_option="-p";; @@ -9762,6 +9766,11 @@ test_mkdir() { fi fi + if [[ -n "$overstripe_count" ]]; then + stripe_command="-C" + dirstripe_count=$overstripe_count + fi + if [ $MDSCOUNT -le 1 ] || ! 
is_lustre ${parent}; then mkdir $path || error "mkdir '$path' failed" else @@ -9785,9 +9794,9 @@ test_mkdir() { dirstripe_count=1 fi - echo "striped dir -i$mdt_index -c$dirstripe_count -H $hash_type $path" - $LFS mkdir -i$mdt_index -c$dirstripe_count -H $hash_type $path || - error "mkdir -i $mdt_index -c$dirstripe_count -H $hash_type $path failed" + echo "striped dir -i$mdt_index $stripe_command$dirstripe_count -H $hash_type $path" + $LFS mkdir -i$mdt_index $stripe_command$dirstripe_count -H $hash_type $path || + error "mkdir -i $mdt_index $stripe_command$dirstripe_count -H $hash_type $path failed" fi } diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c index 6bf2621..88713f2 100644 --- a/lustre/utils/lfs.c +++ b/lustre/utils/lfs.c @@ -229,6 +229,7 @@ static inline int lfs_mirror_delete(int argc, char **argv) " [--mdt-count|-c stripe_count>\n" \ " [--help|-h] [--mdt-hash|-H mdt_hash]\n" \ " [--mdt-index|-i mdt_index[,mdt_index,...]\n" \ + " [--mdt-overcount|-C stripe_count>\n" \ " [--default|-D] [--mode|-o mode]\n" \ " [--max-inherit|-X max_inherit]\n" \ " [--max-inherit-rr max_inherit_rr] \n" \ @@ -502,8 +503,9 @@ command_t cmdlist[] = { "usage: swap_layouts "}, {"migrate", lfs_setstripe_migrate, 0, "migrate directories and their inodes between MDTs.\n" - "usage: migrate [--mdt-count|-c STRIPE_COUNT] [--directory|-d]\n" - " [--mdt-hash|-H HASH_TYPE]\n" + "usage: migrate [--mdt-count|-c STRIPE_COUNT]\n" + " [--mdt-overcount|-C OVERSTRIPE_COUNT\n" + " [--directory|-d] [--mdt-hash|-H HASH_TYPE]\n" " [--mdt-index|-m START_MDT_INDEX] [--verbose|-v]\n" " DIRECTORY\n" "\n" @@ -2570,19 +2572,18 @@ out: * \param[in] size size of \a tgts array * \param[in] offset starting index in \a tgts * \param[in] arg string containing OST index list - * \param[in/out] overstriping index list may contain duplicates + * \param[out] duplicates tell caller list contains duplicates * * \retval positive number of indices in \a tgts * \retval -EINVAL unable to parse \a arg */ static int 
parse_targets(__u32 *tgts, int size, int offset, char *arg, - unsigned long long *pattern) + bool *duplicates) { int rc; int nr = offset; int slots = size - offset; char *ptr = NULL; - bool overstriped = false; bool end_of_loop; if (!arg) @@ -2623,13 +2624,10 @@ static int parse_targets(__u32 *tgts, int size, int offset, char *arg, for (i = start_index; i <= end_index && slots > 0; i++) { int j; - /* remove duplicate */ + /* note presence of duplicates */ for (j = 0; j < offset; j++) { - if (tgts[j] == i && pattern && - *pattern == LLAPI_LAYOUT_OVERSTRIPING) - overstriped = true; - else if (tgts[j] == i) - return -EINVAL; + if (tgts[j] == i) + *duplicates = true; } j = offset; @@ -2651,9 +2649,6 @@ static int parse_targets(__u32 *tgts, int size, int offset, char *arg, if (!end_of_loop && ptr) *ptr = ','; - if (!overstriped && pattern) - *pattern = LLAPI_LAYOUT_DEFAULT; - return rc < 0 ? rc : nr; } @@ -3541,6 +3536,7 @@ static int lfs_setstripe_internal(int argc, char **argv, char *template = NULL; bool foreign_mode = false; char *xattr = NULL; + bool overstriped = false; uint32_t type = LU_FOREIGN_TYPE_NONE, flags = 0; char *mode_opt = NULL; mode_t previous_umask = 0; @@ -3595,6 +3591,8 @@ static int lfs_setstripe_internal(int argc, char **argv, { .val = 'c', .name = "mdt-count", .has_arg = required_argument}, { .val = 'C', .name = "overstripe-count", .has_arg = required_argument}, + { .val = 'C', .name = "mdt-overcount", + .has_arg = required_argument}, { .val = 'd', .name = "delete", .has_arg = no_argument}, { .val = 'd', .name = "destroy", .has_arg = no_argument}, /* used with "lfs migrate -m" */ @@ -3826,7 +3824,10 @@ static int lfs_setstripe_internal(int argc, char **argv, progname, argv[0]); goto usage_error; } - lsa.lsa_pattern = LLAPI_LAYOUT_OVERSTRIPING; + if (migrate_mode) + overstriped = true; + else + lsa.lsa_pattern = LLAPI_LAYOUT_OVERSTRIPING; fallthrough; case 'c': errno = 0; @@ -3996,8 +3997,9 @@ static int lfs_setstripe_internal(int argc, char 
**argv, } migrate_mdt_mode = true; lsa.lsa_nr_tgts = parse_targets(tgts, - sizeof(tgts) / sizeof(__u32), - lsa.lsa_nr_tgts, optarg, NULL); + sizeof(tgts) / sizeof(__u32), + lsa.lsa_nr_tgts, optarg, + &overstriped); if (lsa.lsa_nr_tgts < 0) { fprintf(stderr, "%s: invalid MDT target(s) '%s'\n", @@ -4082,11 +4084,10 @@ static int lfs_setstripe_internal(int argc, char **argv, * parse_targets is shared with MDT striping, which * does not allow duplicates */ - lsa.lsa_pattern = LLAPI_LAYOUT_OVERSTRIPING; lsa.lsa_nr_tgts = parse_targets(tgts, sizeof(tgts) / sizeof(__u32), lsa.lsa_nr_tgts, optarg, - &lsa.lsa_pattern); + &overstriped); if (lsa.lsa_nr_tgts < 0) { fprintf(stderr, "%s %s: invalid OST target(s) '%s'\n", @@ -4094,6 +4095,9 @@ static int lfs_setstripe_internal(int argc, char **argv, goto usage_error; } + if (overstriped) + lsa.lsa_pattern = LLAPI_LAYOUT_OVERSTRIPING; + lsa.lsa_tgts = tgts; if (lsa.lsa_stripe_off == LLAPI_LAYOUT_DEFAULT) lsa.lsa_stripe_off = tgts[0]; @@ -4369,10 +4373,15 @@ static int lfs_setstripe_internal(int argc, char **argv, goto usage_error; } lmu->lum_stripe_offset = lsa.lsa_stripe_off; + if (lsa.lsa_pattern != LLAPI_LAYOUT_RAID0) lmu->lum_hash_type = lsa.lsa_pattern; else lmu->lum_hash_type = LMV_HASH_TYPE_UNKNOWN; + + if (overstriped) + lmu->lum_hash_type |= LMV_HASH_FLAG_OVERSTRIPED; + if (lsa.lsa_pool_name) { strncpy(lmu->lum_pool_name, lsa.lsa_pool_name, sizeof(lmu->lum_pool_name) - 1); @@ -6956,6 +6965,7 @@ static int lfs_setdirstripe(int argc, char **argv) bool delete = false; bool foreign_mode = false; bool mdt_count_set = false; + bool overstriped = false; mode_t mode = S_IRWXU | S_IRWXG | S_IRWXO; mode_t previous_mode = 0; char *xattr = NULL; @@ -6965,6 +6975,7 @@ static int lfs_setdirstripe(int argc, char **argv) struct option long_opts[] = { { .val = 'c', .name = "count", .has_arg = required_argument }, { .val = 'c', .name = "mdt-count", .has_arg = required_argument }, + { .val = 'C', .name = "mdt-overcount", .has_arg = 
required_argument }, { .val = 'd', .name = "delete", .has_arg = no_argument }, { .val = 'D', .name = "default", .has_arg = no_argument }, { .val = 'D', .name = "default_stripe", .has_arg = no_argument }, @@ -7001,13 +7012,17 @@ static int lfs_setdirstripe(int argc, char **argv) setstripe_args_init(&lsa); - while ((c = getopt_long(argc, argv, "c:dDi:hH:m:o:t:T:x:X:", + while ((c = getopt_long(argc, argv, "c:C:dDi:hH:m:o:t:T:x:X:", long_opts, NULL)) >= 0) { switch (c) { case 0: /* Long options. */ break; + case 'C': + overstriped = true; + fallthrough; case 'c': + fallthrough; case 'T': errno = 0; lsa.lsa_stripe_count = strtoul(optarg, &end, 0); @@ -7095,7 +7110,8 @@ static int lfs_setdirstripe(int argc, char **argv) #endif lsa.lsa_nr_tgts = parse_targets(mdts, sizeof(mdts) / sizeof(__u32), - lsa.lsa_nr_tgts, optarg, NULL); + lsa.lsa_nr_tgts, optarg, + &overstriped); if (lsa.lsa_nr_tgts < 0) { fprintf(stderr, "%s %s: invalid MDT target(s) '%s'\n", @@ -7303,10 +7319,17 @@ static int lfs_setdirstripe(int argc, char **argv) param->lsp_stripe_offset = LMV_OFFSET_DEFAULT; else param->lsp_stripe_offset = lsa.lsa_stripe_off; + if (lsa.lsa_pattern != LLAPI_LAYOUT_RAID0) param->lsp_stripe_pattern = lsa.lsa_pattern; else param->lsp_stripe_pattern = LMV_HASH_TYPE_UNKNOWN; + + if (overstriped) { + param->lsp_stripe_pattern |= LMV_HASH_FLAG_OVERSTRIPED; + max_inherit = LMV_INHERIT_DEFAULT_OVERSTRIPED; + } + param->lsp_pool = lsa.lsa_pool_name; param->lsp_is_specific = false; diff --git a/lustre/utils/liblustreapi.c b/lustre/utils/liblustreapi.c index 0834fec..f996a56 100644 --- a/lustre/utils/liblustreapi.c +++ b/lustre/utils/liblustreapi.c @@ -3048,10 +3048,7 @@ static void lmv_dump_user_lmm(struct lmv_user_md *lum, char *pool_name, llapi_printf(LLAPI_MSG_NORMAL, ",lost_lmv"); if (flags & LMV_HASH_FLAG_FIXED) llapi_printf(LLAPI_MSG_NORMAL, ",fixed"); - /* NB: OVERSTRIPED is not in KNOWN until implementation patch - * is landed, but we do recognize it - */ - if (flags & 
~(LMV_HASH_FLAG_KNOWN|LMV_HASH_FLAG_OVERSTRIPED)) + if (flags & ~LMV_HASH_FLAG_KNOWN) llapi_printf(LLAPI_MSG_NORMAL, ",unknown_%04x", flags & ~LMV_HASH_FLAG_KNOWN); -- 1.8.3.1