From: Lai Siyao Date: Tue, 7 Sep 2021 09:33:21 +0000 (-0400) Subject: LU-13076 dne: dir migrate in QOS mode X-Git-Tag: 2.14.56~135 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=378c7567876b430d06031f7d380112b9bdb15166 LU-13076 dne: dir migrate in QOS mode Support "lfs migrate -m -1 ..." to migrate a directory to MDTs chosen by space and inode usage: if the system is balanced, the target MDT is chosen in round-robin mode; otherwise one of the less full MDTs will be chosen, and the most full MDT is avoided. Another minor change: if a directory is migrated to specific MDTs, and the target stripe count is more than 1, its subdirs may not be migrated to the MDT specified in the command, but to the MDT where their parent stripe is located (the subdir will be striped too), as this avoids unnecessary remote directories. NB, for a command like "lfs migrate -m 0,1,2 ...", though the subdir may be located on either MDT0, MDT1 or MDT2, its stripes will be striped over these three MDTs, but for a command like "lfs migrate -m 0 -c 3...", the subdir may be striped on other MDTs if the subdir is not located on MDT0. Add sanity 230u and 230v. Signed-off-by: Lai Siyao Change-Id: I6e9c3d75bfc240b21c65ba27cd5e4bcca7058325 Reviewed-on: https://review.whamcloud.com/44886 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Hongchao Zhang Reviewed-by: Oleg Drokin --- diff --git a/lustre/doc/lfs-migrate.1 b/lustre/doc/lfs-migrate.1 index 0725c1d..fb72541 100644 --- a/lustre/doc/lfs-migrate.1 +++ b/lustre/doc/lfs-migrate.1 @@ -89,7 +89,9 @@ Directory will be migrated to MDTs starting with , or specific MDTs if multiple MDTs are specified in a comma-seperated list. This is useful if new MDTs have been added to a filesystem and existing user or project directories should be migrated off old MDTs to balance the space usage -and future metadata workload. +and future metadata workload. 
If +.I start_mdt_index +is set to -1, the MDT will be chosen by space and inode usage. .TP .BR -c , --mdt-count=\fICOUNT\fR Directory will be migrated to diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c index 96b6b5d..64d19e3 100644 --- a/lustre/lmv/lmv_obd.c +++ b/lustre/lmv/lmv_obd.c @@ -1459,7 +1459,7 @@ static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } -static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt, +static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 mdt, unsigned short dir_depth) { struct lu_tgt_desc *tgt, *cur = NULL; @@ -1492,7 +1492,7 @@ static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt, tgt->ltd_qos.ltq_usable = 1; lu_tgt_qos_weight_calc(tgt); - if (tgt->ltd_index == *mdt) + if (tgt->ltd_index == mdt) cur = tgt; total_avail += tgt->ltd_qos.ltq_avail; total_weight += tgt->ltd_qos.ltq_weight; @@ -1507,7 +1507,7 @@ static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt, (total_usable * 256 * (1 + dir_depth / 4)); if (cur && cur->ltd_qos.ltq_avail >= rand) { tgt = cur; - GOTO(unlock, rc = 0); + GOTO(unlock, tgt); } rand = lu_prandom_u64_max(total_weight); @@ -1520,9 +1520,8 @@ static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt, if (cur_weight < rand) continue; - *mdt = tgt->ltd_index; ltd_qos_update(&lmv->lmv_mdt_descs, tgt, &total_weight); - GOTO(unlock, rc = 0); + GOTO(unlock, tgt); } /* no proper target found */ @@ -1533,7 +1532,7 @@ unlock: return tgt; } -static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt) +static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv) { struct lu_tgt_desc *tgt; int i; @@ -1549,8 +1548,7 @@ static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt) if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) continue; - *mdt = tgt->ltd_index; - lmv->lmv_qos_rr_index = (*mdt + 1) % + lmv->lmv_qos_rr_index = 
(tgt->ltd_index + 1) % lmv->lmv_mdt_descs.ltd_tgts_size; spin_unlock(&lmv->lmv_lock); @@ -1561,6 +1559,65 @@ static struct lu_tgt_desc *lmv_locate_tgt_rr(struct lmv_obd *lmv, __u32 *mdt) RETURN(ERR_PTR(-ENODEV)); } +/* locate MDT which is less full (avoid the most full MDT) */ +static struct lu_tgt_desc *lmv_locate_tgt_lf(struct lmv_obd *lmv) +{ + struct lu_tgt_desc *min = NULL; + struct lu_tgt_desc *tgt; + __u64 avail = 0; + __u64 rand; + + ENTRY; + + if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs)) + RETURN(ERR_PTR(-EAGAIN)); + + down_write(&lmv->lmv_qos.lq_rw_sem); + + if (!ltd_qos_is_usable(&lmv->lmv_mdt_descs)) + GOTO(unlock, tgt = ERR_PTR(-EAGAIN)); + + lmv_foreach_tgt(lmv, tgt) { + if (!tgt->ltd_exp || !tgt->ltd_active) { + tgt->ltd_qos.ltq_usable = 0; + continue; + } + + tgt->ltd_qos.ltq_usable = 1; + lu_tgt_qos_weight_calc(tgt); + avail += tgt->ltd_qos.ltq_avail; + if (!min || min->ltd_qos.ltq_avail > tgt->ltd_qos.ltq_avail) + min = tgt; + } + + /* avoid the most full MDT */ + if (min) + avail -= min->ltd_qos.ltq_avail; + + rand = lu_prandom_u64_max(avail); + avail = 0; + lmv_foreach_connected_tgt(lmv, tgt) { + if (!tgt->ltd_qos.ltq_usable) + continue; + + if (tgt == min) + continue; + + avail += tgt->ltd_qos.ltq_avail; + if (avail < rand) + continue; + + GOTO(unlock, tgt); + } + + /* no proper target found */ + GOTO(unlock, tgt = ERR_PTR(-EAGAIN)); +unlock: + up_write(&lmv->lmv_qos.lq_rw_sem); + + RETURN(tgt); +} + /* locate MDT by file name, for striped directory, the file name hash decides * which stripe its dirent is stored. 
*/ @@ -1873,7 +1930,7 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, } else if (lmv_op_qos_mkdir(op_data)) { struct lmv_tgt_desc *tmp = tgt; - tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds, + tgt = lmv_locate_tgt_qos(lmv, op_data->op_mds, op_data->op_dir_depth); if (tgt == ERR_PTR(-EAGAIN)) { if (ltd_qos_is_balanced(&lmv->lmv_mdt_descs) && @@ -1884,11 +1941,12 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, */ tgt = tmp; else - tgt = lmv_locate_tgt_rr(lmv, &op_data->op_mds); + tgt = lmv_locate_tgt_rr(lmv); } if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); + op_data->op_mds = tgt->ltd_index; /* * only update statfs after QoS mkdir, this means the cached * statfs may be stale, and current mkdir may not follow QoS @@ -2105,6 +2163,53 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } +/* migrate the top directory */ +static inline bool lmv_op_topdir_migrate(const struct md_op_data *op_data) +{ + if (!S_ISDIR(op_data->op_mode)) + return false; + + if (lmv_dir_layout_changing(op_data->op_mea1)) + return false; + + return true; +} + +/* migrate top dir to specific MDTs */ +static inline bool lmv_topdir_specific_migrate(const struct md_op_data *op_data) +{ + const struct lmv_user_md *lum = op_data->op_data; + + if (!lmv_op_topdir_migrate(op_data)) + return false; + + return le32_to_cpu(lum->lum_stripe_offset) != LMV_OFFSET_DEFAULT; +} + +/* migrate top dir in QoS mode if user issued "lfs migrate -m -1..." 
*/ +static inline bool lmv_topdir_qos_migrate(const struct md_op_data *op_data) +{ + const struct lmv_user_md *lum = op_data->op_data; + + if (!lmv_op_topdir_migrate(op_data)) + return false; + + return le32_to_cpu(lum->lum_stripe_offset) == LMV_OFFSET_DEFAULT; +} + +static inline bool lmv_subdir_specific_migrate(const struct md_op_data *op_data) +{ + const struct lmv_user_md *lum = op_data->op_data; + + if (!S_ISDIR(op_data->op_mode)) + return false; + + if (!lmv_dir_layout_changing(op_data->op_mea1)) + return false; + + return le32_to_cpu(lum->lum_stripe_offset) != LMV_OFFSET_DEFAULT; +} + static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, const char *name, size_t namelen, struct ptlrpc_request **request) @@ -2171,19 +2276,56 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, if (IS_ERR(child_tgt)) RETURN(PTR_ERR(child_tgt)); - /* for directory, migrate to MDT specified by lum_stripe_offset; - * otherwise migrate to the target stripe of parent, but parent - * directory may have finished migration (normally current file too), - * allocate FID on MDT lum_stripe_offset, and server will check - * whether file was migrated already. - */ - if (S_ISDIR(op_data->op_mode) || !tp_tgt) { + if (lmv_topdir_specific_migrate(op_data)) { struct lmv_user_md *lum = op_data->op_data; op_data->op_mds = le32_to_cpu(lum->lum_stripe_offset); - } else { + } else if (lmv_topdir_qos_migrate(op_data)) { + tgt = lmv_locate_tgt_lf(lmv); + if (tgt == ERR_PTR(-EAGAIN)) + tgt = lmv_locate_tgt_rr(lmv); + if (IS_ERR(tgt)) + RETURN(PTR_ERR(tgt)); + + op_data->op_mds = tgt->ltd_index; + } else if (lmv_subdir_specific_migrate(op_data)) { + struct lmv_user_md *lum = op_data->op_data; + __u32 i; + + LASSERT(tp_tgt); + if (le32_to_cpu(lum->lum_magic) == LMV_USER_MAGIC_SPECIFIC) { + /* adjust MDTs in lum, since subdir is located on where + * its parent stripe is, not the first specified MDT. 
+ */ + for (i = 0; i < le32_to_cpu(lum->lum_stripe_count); + i++) { + if (le32_to_cpu(lum->lum_objects[i].lum_mds) == + tp_tgt->ltd_index) + break; + } + + if (i == le32_to_cpu(lum->lum_stripe_count)) + RETURN(-ENODEV); + + lum->lum_objects[i].lum_mds = + lum->lum_objects[0].lum_mds; + lum->lum_objects[0].lum_mds = + cpu_to_le32(tp_tgt->ltd_index); + } + /* NB, the above adjusts subdir migration for command like + * "lfs migrate -m 0,1,2 ...", but for migration like + * "lfs migrate -m 0 -c 2 ...", the top dir is migrated to MDT0 + * and MDT1, however its subdir may be migrated to MDT1 and MDT2 + */ + + lum->lum_stripe_offset = cpu_to_le32(tp_tgt->ltd_index); + op_data->op_mds = tp_tgt->ltd_index; + } else if (tp_tgt) { op_data->op_mds = tp_tgt->ltd_index; + } else { + op_data->op_mds = sp_tgt->ltd_index; } + rc = lmv_fid_alloc(NULL, exp, &target_fid, op_data); if (rc) RETURN(rc); diff --git a/lustre/mdt/mdt_xattr.c b/lustre/mdt/mdt_xattr.c index 59b3374..819086b 100644 --- a/lustre/mdt/mdt_xattr.c +++ b/lustre/mdt/mdt_xattr.c @@ -422,13 +422,10 @@ int mdt_dir_layout_update(struct mdt_thread_info *info) GOTO(unlock_obj, rc = -EINVAL); } - if (lmu->lum_stripe_offset != lmv->lmv_master_mdt_index) { - CERROR("%s: "DFID" migrate mdt index mismatch %u != %u\n", - mdt_obd_name(info->mti_mdt), PFID(rr->rr_fid1), - lmv->lmv_master_mdt_index, - lmu->lum_stripe_offset); - GOTO(unlock_obj, rc = -EINVAL); - } + /* lum_stripe_offset is not checked, because subdir is migrated + * to where its parent is located to avoid unnecessary remote + * directory. + */ if (lum_stripe_count > 1 && lmu->lum_hash_type && lmu->lum_hash_type != diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 6f1e2aa..ccdda90 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -19644,7 +19644,7 @@ test_230d() { error "migrate remote dir error" echo "Finish migration, then checking.." 
- for file in $(find $migrate_dir); do + for file in $(find $migrate_dir -maxdepth 1); do mdt_index=$($LFS getstripe -m $file) if [ $mdt_index -lt $new_index ] || [ $mdt_index -gt $((new_index + new_count - 1)) ]; then @@ -20201,6 +20201,48 @@ test_230t() } run_test 230t "migrate directory with project ID set" +test_230u() +{ + (( MDSCOUNT > 3 )) || skip_env "needs >= 4 MDTs" + (( MDS1_VERSION >= $(version_code 2.14.53) )) || + skip "Need MDS version at least 2.14.53" + + local count + + mkdir_on_mdt0 $DIR/$tdir || error "mkdir $tdir failed" + mkdir $DIR/$tdir/sub{0..99} || error "mkdir sub failed" + $LFS migrate -m -1 $DIR/$tdir/sub{0..99} || error "migrate sub failed" + for i in $(seq 0 $((MDSCOUNT - 1))); do + count=$($LFS getstripe -m $DIR/$tdir/sub* | grep -c ^$i) + echo "$count dirs migrated to MDT$i" + done + count=$($LFS getstripe -m $DIR/$tdir/sub* | sort -u | wc -l) + (( count >= MDSCOUNT - 1 )) || error "dirs migrated to $count MDTs" +} +run_test 230u "migrate directory by QOS" + +test_230v() +{ + (( MDSCOUNT > 3 )) || skip_env "needs >= 4 MDTs" + (( MDS1_VERSION >= $(version_code 2.14.53) )) || + skip "Need MDS version at least 2.14.53" + + local count + + mkdir $DIR/$tdir || error "mkdir $tdir failed" + mkdir $DIR/$tdir/sub{0..99} || error "mkdir sub failed" + $LFS migrate -m 0,2,1 $DIR/$tdir || error "migrate $tdir failed" + for i in $(seq 0 $((MDSCOUNT - 1))); do + count=$($LFS getstripe -m $DIR/$tdir/sub* | grep -c ^$i) + echo "$count subdirs migrated to MDT$i" + (( i == 3 )) && (( count > 0 )) && + error "subdir shouldn't be migrated to MDT3" + done + count=$($LFS getstripe -m $DIR/$tdir/sub* | sort -u | wc -l) + (( count == 3 )) || error "dirs migrated to $count MDTs" +} +run_test 230v "subdir migrated to the MDT where its parent is located" + test_231a() { # For simplicity this test assumes that max_pages_per_rpc