From b9c4dc3c33fe87ecaa79a290190524ea21b7fa8a Mon Sep 17 00:00:00 2001 From: Lai Siyao Date: Mon, 21 Jun 2021 11:52:01 +0800 Subject: [PATCH] LU-14792 llite: enable filesystem-wide default LMV This change includes three parts: 1. save each directory's depth from ROOT after lookup on the client side. 2. once a space-balanced default LMV is set on ROOT, and max-inherit/max-inherit-rr is unlimited or not less than the directory depth, new directories will be created in QOS or roundrobin mode. 3. set ROOT default LMV max-inherit to unlimited and max-inherit-rr to 3, and increase the likelihood of creating a subdirectory on the local MDT as its depth from ROOT grows, so that new directories are placed by space usage, the deeper a directory is located the more likely it is to be created on the local MDT, and the top 3 levels are created in roundrobin mode if the system is balanced. Set default LMV in mkdir_on_mdt() to make sure its subdirectories are created on the same MDT. Add sanity 413d. Create a test directory on MDT0 for pjdfstest, because a cross-MDT rename of a symlink migrates the symlink to the target MDT, which changes its inode (LU-11631). 
Signed-off-by: Lai Siyao Change-Id: Ib3a133ac99655ca04443b9498e6618033f6b88b9 Reviewed-on: https://review.whamcloud.com/44090 Reviewed-by: Andreas Dilger Tested-by: jenkins Reviewed-by: Hongchao Zhang Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/include/obd.h | 10 +++- lustre/include/uapi/linux/lustre/lustre_user.h | 2 + lustre/llite/dir.c | 2 + lustre/llite/file.c | 5 +- lustre/llite/llite_internal.h | 5 +- lustre/llite/llite_lib.c | 16 +++++++ lustre/llite/namei.c | 63 +++++++++++++++++++++++++- lustre/llite/statahead.c | 5 +- lustre/lmv/lmv_obd.c | 32 +++++++------ lustre/lmv/lproc_lmv.c | 26 ++++++++++- lustre/lod/lod_object.c | 20 +------- lustre/lod/lproc_lod.c | 14 +++++- lustre/mdd/mdd_device.c | 2 + lustre/tests/pjdfstest.sh | 4 +- lustre/tests/sanity.sh | 27 +++++++++++ lustre/tests/test-framework.sh | 2 + 16 files changed, 192 insertions(+), 43 deletions(-) diff --git a/lustre/include/obd.h b/lustre/include/obd.h index e4daa4c..1eacd53 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -838,6 +838,8 @@ enum md_op_flags { MF_MDC_CANCEL_FID4 = BIT(3), MF_GET_MDT_IDX = BIT(4), MF_GETATTR_BY_FID = BIT(5), + MF_QOS_MKDIR = BIT(6), + MF_RR_MKDIR = BIT(7), }; enum md_cli_flags { @@ -923,8 +925,12 @@ struct md_op_data { __u32 op_projid; - /* Used by readdir */ - unsigned int op_max_pages; + union { + /* Used by readdir */ + unsigned int op_max_pages; + /* mkdir */ + unsigned short op_dir_depth; + }; __u16 op_mirror_id; diff --git a/lustre/include/uapi/linux/lustre/lustre_user.h b/lustre/include/uapi/linux/lustre/lustre_user.h index a245691..d64e888f 100644 --- a/lustre/include/uapi/linux/lustre/lustre_user.h +++ b/lustre/include/uapi/linux/lustre/lustre_user.h @@ -1150,6 +1150,8 @@ enum { LMV_INHERIT_RR_DEFAULT = 0, /* not inherit any more */ LMV_INHERIT_RR_END = 1, + /* default inherit_rr of ROOT */ + LMV_INHERIT_RR_ROOT = 3, /* max inherit depth */ LMV_INHERIT_RR_MAX = 250, /* [251, 254] are reserved */ diff --git a/lustre/llite/dir.c 
b/lustre/llite/dir.c index 9e01428..4212db0 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -469,6 +469,8 @@ static int ll_dir_setdirstripe(struct dentry *dparent, struct lmv_user_md *lump, if (IS_ERR(op_data)) RETURN(PTR_ERR(op_data)); + op_data->op_dir_depth = ll_i2info(parent)->lli_depth; + if (ll_sbi_has_encrypt(sbi) && (IS_ENCRYPTED(parent) || unlikely(llcrypt_dummy_context_enabled(parent)))) { diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 41af22a..934be16 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -671,8 +671,11 @@ retry: * of kernel will deal with that later. */ ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, &bits); - if (bits & MDS_INODELOCK_LOOKUP) + if (bits & MDS_INODELOCK_LOOKUP) { d_lustre_revalidate(de); + ll_update_dir_depth(parent->d_inode, de->d_inode); + } + /* if DoM bit returned along with LAYOUT bit then there * can be read-on-open data returned. */ diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index c416226..c28d7ca 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -168,12 +168,14 @@ struct ll_inode_info { /* "opendir_pid" is the token when lookup/revalid * -- I am the owner of dir statahead. */ pid_t lli_opendir_pid; + /* directory depth to ROOT */ + unsigned short lli_depth; /* stat will try to access statahead entries or start * statahead if this flag is set, and this flag will be * set upon dir open, and cleared when dir is closed, * statahead hit ratio is too low, or start statahead * thread failed. 
*/ - unsigned int lli_sa_enabled:1; + unsigned short lli_sa_enabled:1; /* generation for statahead */ unsigned int lli_sa_generation; /* rw lock protects lli_lsm_md */ @@ -1212,6 +1214,7 @@ int ll_statfs_internal(struct ll_sb_info *sbi, struct obd_statfs *osfs, u32 flags); int ll_update_inode(struct inode *inode, struct lustre_md *md); void ll_update_inode_flags(struct inode *inode, unsigned int ext_flags); +void ll_update_dir_depth(struct inode *dir, struct inode *inode); int ll_read_inode2(struct inode *inode, void *opaque); void ll_truncate_inode_pages_final(struct inode *inode); void ll_delete_inode(struct inode *inode); diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index f29c319..651ce89 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -2560,6 +2560,22 @@ int ll_update_inode(struct inode *inode, struct lustre_md *md) return 0; } +/* update directory depth to ROOT, called after LOOKUP lock is fetched. */ +void ll_update_dir_depth(struct inode *dir, struct inode *inode) +{ + struct ll_inode_info *lli; + + if (!S_ISDIR(inode->i_mode)) + return; + + if (inode == dir) + return; + + lli = ll_i2info(inode); + lli->lli_depth = ll_i2info(dir)->lli_depth + 1; + CDEBUG(D_INODE, DFID" depth %hu\n", PFID(&lli->lli_fid), lli->lli_depth); +} + void ll_truncate_inode_pages_final(struct inode *inode) { struct address_space *mapping = &inode->i_data; diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index 29d4806..9409c66 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -754,8 +754,10 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request, if (!it_disposition(it, DISP_LOOKUP_NEG)) { /* we have lookup look - unhide dentry */ - if (bits & MDS_INODELOCK_LOOKUP) + if (bits & MDS_INODELOCK_LOOKUP) { d_lustre_revalidate(*de); + ll_update_dir_depth(parent, (*de)->d_inode); + } if (encrypt) { rc = llcrypt_get_encryption_info(inode); @@ -1430,8 +1432,10 @@ static int ll_create_it(struct inode *dir, struct dentry 
*dentry, } ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, inode, it, &bits); - if (bits & MDS_INODELOCK_LOOKUP) + if (bits & MDS_INODELOCK_LOOKUP) { d_lustre_revalidate(dentry); + ll_update_dir_depth(dir, inode); + } RETURN(0); } @@ -1456,6 +1460,58 @@ void ll_update_times(struct ptlrpc_request *request, struct inode *inode) inode->i_ctime.tv_sec = body->mbo_ctime; } +/* once default LMV (space balanced) is set on ROOT, it should take effect if + * default LMV is not set on parent directory. + */ +static void ll_qos_mkdir_prep(struct md_op_data *op_data, struct inode *dir) +{ + struct inode *root = dir->i_sb->s_root->d_inode; + struct ll_inode_info *rlli = ll_i2info(root); + struct ll_inode_info *lli = ll_i2info(dir); + struct lmv_stripe_md *lsm; + + op_data->op_dir_depth = lli->lli_depth; + + /* parent directory is striped */ + if (unlikely(lli->lli_lsm_md)) + return; + + /* default LMV set on parent directory */ + if (unlikely(lli->lli_default_lsm_md)) + return; + + /* parent is ROOT */ + if (unlikely(dir == root)) + return; + + /* default LMV not set on ROOT */ + if (!rlli->lli_default_lsm_md) + return; + + down_read(&rlli->lli_lsm_sem); + lsm = rlli->lli_default_lsm_md; + if (!lsm) + goto unlock; + + /* not space balanced */ + if (lsm->lsm_md_master_mdt_index != LMV_OFFSET_DEFAULT) + goto unlock; + + if (lsm->lsm_md_max_inherit != LMV_INHERIT_NONE && + (lsm->lsm_md_max_inherit == LMV_INHERIT_UNLIMITED || + lsm->lsm_md_max_inherit >= lli->lli_depth)) { + op_data->op_flags |= MF_QOS_MKDIR; + if (lsm->lsm_md_max_inherit_rr != LMV_INHERIT_RR_NONE && + (lsm->lsm_md_max_inherit_rr == LMV_INHERIT_RR_UNLIMITED || + lsm->lsm_md_max_inherit_rr >= lli->lli_depth)) + op_data->op_flags |= MF_RR_MKDIR; + CDEBUG(D_INODE, DFID" requests qos mkdir %#x\n", + PFID(&lli->lli_fid), op_data->op_flags); + } +unlock: + up_read(&rlli->lli_lsm_sem); +} + static int ll_new_node(struct inode *dir, struct dentry *dchild, const char *tgt, umode_t mode, int rdev, __u32 opc) { @@ -1478,6 +1534,9 
@@ again: if (IS_ERR(op_data)) GOTO(err_exit, err = PTR_ERR(op_data)); + if (S_ISDIR(mode)) + ll_qos_mkdir_prep(op_data, dir); + if (sbi->ll_flags & LL_SBI_FILE_SECCTX) { err = ll_dentry_init_security(dchild, mode, &dchild->d_name, &op_data->op_file_secctx_name, diff --git a/lustre/llite/statahead.c b/lustre/llite/statahead.c index 775c488..547285f 100644 --- a/lustre/llite/statahead.c +++ b/lustre/llite/statahead.c @@ -1507,8 +1507,11 @@ static int revalidate_statahead_dentry(struct inode *dir, } if ((bits & MDS_INODELOCK_LOOKUP) && - d_lustre_invalid(*dentryp)) + d_lustre_invalid(*dentryp)) { d_lustre_revalidate(*dentryp); + ll_update_dir_depth(dir, (*dentryp)->d_inode); + } + ll_intent_release(&it); } } diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c index 292babe..d1330f0 100644 --- a/lustre/lmv/lmv_obd.c +++ b/lustre/lmv/lmv_obd.c @@ -1459,7 +1459,8 @@ static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } -static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt) +static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt, + unsigned short dir_depth) { struct lu_tgt_desc *tgt, *cur = NULL; __u64 total_avail = 0; @@ -1500,10 +1501,10 @@ static struct lu_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt) /* if current MDT has above-average space, within range of the QOS * threshold, stay on the same MDT to avoid creating needless remote - * MDT directories. + * MDT directories. It's more likely for low level directories. 
*/ rand = total_avail * (256 - lmv->lmv_qos.lq_threshold_rr) / - (total_usable * 256); + (total_usable * 256 * (1 + dir_depth / 4)); if (cur && cur->ltd_qos.ltq_avail >= rand) { tgt = cur; GOTO(unlock, rc = 0); @@ -1751,12 +1752,14 @@ static inline bool lmv_op_default_qos_mkdir(const struct md_op_data *op_data) { const struct lmv_stripe_md *lsm = op_data->op_default_mea1; - return lsm && lsm->lsm_md_master_mdt_index == LMV_OFFSET_DEFAULT; + return (op_data->op_flags & MF_QOS_MKDIR) || + (lsm && lsm->lsm_md_master_mdt_index == LMV_OFFSET_DEFAULT); } -/* mkdir by QoS in two cases: - * 1. 'lfs mkdir -i -1' - * 2. parent default LMV master_mdt_index is -1 +/* mkdir by QoS in three cases: + * 1. ROOT default LMV is space balanced. + * 2. 'lfs mkdir -i -1' + * 3. parent default LMV master_mdt_index is -1 * * NB, mkdir by QoS only if parent is not striped, this is to avoid remote * directories under striped directory. @@ -1778,11 +1781,12 @@ static inline bool lmv_op_qos_mkdir(const struct md_op_data *op_data) return false; } -/* if default LMV is set, and its index is LMV_OFFSET_DEFAULT, and - * 1. max_inherit_rr is set and is not LMV_INHERIT_RR_NONE +/* if parent default LMV is space balanced, and + * 1. max_inherit_rr is set * 2. or parent is ROOT - * mkdir roundrobin. - * NB, this also needs to check server is balanced, which is checked by caller. + * mkdir roundrobin. Or if parent doesn't have default LMV, while ROOT default + * LMV requests roundrobin mkdir, do the same. + * NB, this needs to check server is balanced, which is done by caller. 
*/ static inline bool lmv_op_default_rr_mkdir(const struct md_op_data *op_data) { @@ -1791,7 +1795,8 @@ static inline bool lmv_op_default_rr_mkdir(const struct md_op_data *op_data) if (!lmv_op_default_qos_mkdir(op_data)) return false; - return lsm->lsm_md_max_inherit_rr != LMV_INHERIT_RR_NONE || + return (op_data->op_flags & MF_RR_MKDIR) || + (lsm && lsm->lsm_md_max_inherit_rr != LMV_INHERIT_RR_NONE) || fid_is_root(&op_data->op_fid1); } @@ -1868,7 +1873,8 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, } else if (lmv_op_qos_mkdir(op_data)) { struct lmv_tgt_desc *tmp = tgt; - tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds); + tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds, + op_data->op_dir_depth); if (tgt == ERR_PTR(-EAGAIN)) { if (ltd_qos_is_balanced(&lmv->lmv_mdt_descs) && !lmv_op_default_rr_mkdir(op_data) && diff --git a/lustre/lmv/lproc_lmv.c b/lustre/lmv/lproc_lmv.c index feded16..b095aa2 100644 --- a/lustre/lmv/lproc_lmv.c +++ b/lustre/lmv/lproc_lmv.c @@ -120,10 +120,21 @@ static ssize_t qos_prio_free_store(struct kobject *kobj, struct obd_device *obd = container_of(kobj, struct obd_device, obd_kset.kobj); struct lmv_obd *lmv = &obd->u.lmv; + char buf[6], *tmp; unsigned int val; int rc; - rc = kstrtouint(buffer, 0, &val); + /* "100%\n\0" should be largest string */ + if (count >= sizeof(buf)) + return -ERANGE; + + strncpy(buf, buffer, sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + tmp = strchr(buf, '%'); + if (tmp) + *tmp = '\0'; + + rc = kstrtouint(buf, 0, &val); if (rc) return rc; @@ -157,10 +168,21 @@ static ssize_t qos_threshold_rr_store(struct kobject *kobj, struct obd_device *obd = container_of(kobj, struct obd_device, obd_kset.kobj); struct lmv_obd *lmv = &obd->u.lmv; + char buf[6], *tmp; unsigned int val; int rc; - rc = kstrtouint(buffer, 0, &val); + /* "100%\n\0" should be largest string */ + if (count >= sizeof(buf)) + return -ERANGE; + + strncpy(buf, buffer, sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + tmp = strchr(buf, 
'%'); + if (tmp) + *tmp = '\0'; + + rc = kstrtouint(buf, 0, &val); if (rc) return rc; diff --git a/lustre/lod/lod_object.c b/lustre/lod/lod_object.c index e2dd864..ea57141 100644 --- a/lustre/lod/lod_object.c +++ b/lustre/lod/lod_object.c @@ -5925,24 +5925,8 @@ static int lod_declare_create(const struct lu_env *env, struct dt_object *dt, if (OBD_FAIL_CHECK(OBD_FAIL_MDS_STALE_DIR_LAYOUT)) GOTO(out, rc = -EREMOTE); - if (lo->ldo_dir_stripe_offset == LMV_OFFSET_DEFAULT) { - struct lod_default_striping *lds; - - lds = lo->ldo_def_striping; - /* - * child and parent should be on the same MDT, - * but if parent has default LMV, and the start - * MDT offset is -1, it's allowed. This check - * is not necessary after 2.12.22 because client - * follows this already, but old client may not. - */ - if (hint->dah_parent && - dt_object_remote(hint->dah_parent) && lds && - lds->lds_dir_def_stripe_offset != - LMV_OFFSET_DEFAULT) - GOTO(out, rc = -EREMOTE); - } else if (lo->ldo_dir_stripe_offset != - ss->ss_node_id) { + if (lo->ldo_dir_stripe_offset != LMV_OFFSET_DEFAULT && + lo->ldo_dir_stripe_offset != ss->ss_node_id) { struct lod_device *lod; struct lu_tgt_desc *mdt = NULL; bool found_mdt = false; diff --git a/lustre/lod/lproc_lod.c b/lustre/lod/lproc_lod.c index 3e72da6..602ce8e 100644 --- a/lustre/lod/lproc_lod.c +++ b/lustre/lod/lproc_lod.c @@ -555,15 +555,27 @@ static ssize_t __qos_prio_free_store(struct kobject *kobj, struct lod_device *lod = dt2lod_dev(dt); struct lu_tgt_descs *ltd = is_mdt ? 
&lod->lod_mdt_descs : &lod->lod_ost_descs; + char buf[6], *tmp; unsigned int val; int rc; - rc = kstrtouint(buffer, 0, &val); + /* "100%\n\0" should be largest string */ + if (count >= sizeof(buf)) + return -ERANGE; + + strncpy(buf, buffer, sizeof(buf)); + buf[sizeof(buf) - 1] = '\0'; + tmp = strchr(buf, '%'); + if (tmp) + *tmp = '\0'; + + rc = kstrtouint(buf, 0, &val); if (rc) return rc; if (val > 100) return -EINVAL; + ltd->ltd_qos.lq_prio_free = (val << 8) / 100; set_bit(LQ_DIRTY, &ltd->ltd_qos.lq_flags); set_bit(LQ_RESET, &ltd->ltd_qos.lq_flags); diff --git a/lustre/mdd/mdd_device.c b/lustre/mdd/mdd_device.c index 8876830..0e7423b 100644 --- a/lustre/mdd/mdd_device.c +++ b/lustre/mdd/mdd_device.c @@ -1320,6 +1320,8 @@ static int mdd_prepare(const struct lu_env *env, .lum_magic = LMV_USER_MAGIC, .lum_stripe_count = 1, .lum_stripe_offset = LMV_OFFSET_DEFAULT, + .lum_max_inherit = LMV_INHERIT_UNLIMITED, + .lum_max_inherit_rr = LMV_INHERIT_RR_ROOT, }; th = dt_trans_create(env, mdd->mdd_bottom); diff --git a/lustre/tests/pjdfstest.sh b/lustre/tests/pjdfstest.sh index f4350f8..39b4c0f 100644 --- a/lustre/tests/pjdfstest.sh +++ b/lustre/tests/pjdfstest.sh @@ -61,8 +61,8 @@ run_lustre_ext4() { run_pjdfstest $EXT4_MNTPT $pjdfstest $EXT4_LOG log "Run $pjdfstest against lustre filesystem" - run_pjdfstest $MOUNT $pjdfstest $LUSTRE_LOG - + mkdir_on_mdt0 $MOUNT/pjdfstest + run_pjdfstest $MOUNT/pjdfstest $pjdfstest $LUSTRE_LOG } setup_ext4() { diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index cd08cde..82ba090 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -24890,6 +24890,33 @@ test_413c() { } run_test 413c "mkdir with default LMV max inherit rr" +test_413d() { + (( MDSCOUNT >= 2 )) || + skip "We need at least 2 MDTs for this test" + + (( MDS1_VERSION >= $(version_code 2.14.51) )) || + skip "Need server version at least 2.14.51" + + local lmv_qos_threshold_rr + + lmv_qos_threshold_rr=$($LCTL get_param -n lmv.*.qos_threshold_rr | + head -n1) + 
stack_trap "$LCTL set_param \ + lmv.*.qos_threshold_rr=$lmv_qos_threshold_rr > /dev/null" EXIT + + $LCTL set_param lmv.*.qos_threshold_rr=100 > /dev/null + mkdir -p $DIR/$tdir || error "mkdir $tdir failed" + getfattr -d -m dmv -e hex $DIR/$tdir | grep dmv && + error "$tdir shouldn't have default LMV" + createmany -d $DIR/$tdir/sub $((100 * MDSCOUNT)) || + error "mkdir sub failed" + + local count=$($LFS getstripe -m $DIR/$tdir/* | grep -c ^0) + + (( count == 100 )) || error "$count subdirs on MDT0" +} +run_test 413d "inherit ROOT default LMV" + test_413z() { local pids="" local subdir diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 90031ea..258f347 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -10476,6 +10476,8 @@ mkdir_on_mdt() { shift $((OPTIND - 1)) $LFS mkdir -i $mdt -c 1 $* + # setting default LMV in non-DNE system will cause sanity-quota 41 fail + ((MDSCOUNT < 2)) || $LFS setdirstripe -D -i $mdt -c 1 $* } mkdir_on_mdt0() { -- 1.8.3.1