From 6d296587441d80588340200903027ac4231922cd Mon Sep 17 00:00:00 2001 From: Lai Siyao Date: Fri, 15 Feb 2019 22:07:56 +0800 Subject: [PATCH] LU-11213 lmv: mkdir with balanced space usage If the default LMV hash type of a plain directory is "space", create its subdirs across all MDTs with balanced space usage: * client mkdir allocates the FID on an MDT chosen for balanced space usage (the space QoS code is in the next patch). * the MDT allows mkdir on a different MDT from its parent if the parent has "space" hash type in its default LMV; this is normally rejected because mkdir shouldn't create a remote directory. Signed-off-by: Lai Siyao Change-Id: I284e21f334c07462211be4c8e38e965722d1e8a8 Reviewed-on: https://review.whamcloud.com/34360 Reviewed-by: Andreas Dilger Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Hongchao Zhang Reviewed-by: Oleg Drokin --- lustre/include/lustre_lmv.h | 51 ++++++- lustre/llite/dir.c | 5 +- lustre/llite/file.c | 11 +- lustre/llite/llite_internal.h | 7 + lustre/llite/llite_lib.c | 24 ++- lustre/llite/namei.c | 10 +- lustre/lmv/lmv_intent.c | 21 ++- lustre/lmv/lmv_internal.h | 30 +--- lustre/lmv/lmv_obd.c | 334 +++++++++++++++++++++--------------------- lustre/lod/lod_internal.h | 5 +- lustre/lod/lod_lov.c | 2 +- lustre/lod/lod_object.c | 91 ++++++++---- lustre/mdd/mdd_dir.c | 2 +- 13 files changed, 319 insertions(+), 274 deletions(-) diff --git a/lustre/include/lustre_lmv.h b/lustre/include/lustre_lmv.h index e683e6c..a3a5120 100644 --- a/lustre/include/lustre_lmv.h +++ b/lustre/include/lustre_lmv.h @@ -54,6 +54,47 @@ struct lmv_stripe_md { struct lmv_oinfo lsm_md_oinfo[0]; }; +/* NB: LMV_HASH_TYPE_SPACE is set in default LMV only */ +static inline bool lmv_is_known_hash_type(__u32 type) +{ + return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 || + (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS; +} + +static inline bool lmv_dir_striped(const struct lmv_stripe_md *lsm) +{ + return lsm && lsm->lsm_md_magic == LMV_MAGIC; +} + +static inline bool lmv_dir_foreign(const struct
lmv_stripe_md *lsm) +{ + return lsm && lsm->lsm_md_magic == LMV_MAGIC_FOREIGN; +} + +static inline bool lmv_dir_migrating(const struct lmv_stripe_md *lsm) +{ + return lmv_dir_striped(lsm) && + lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION; +} + +static inline bool lmv_dir_bad_hash(const struct lmv_stripe_md *lsm) +{ + if (!lmv_dir_striped(lsm)) + return false; + + if (lmv_dir_migrating(lsm) && + lsm->lsm_md_stripe_count - lsm->lsm_md_migrate_offset <= 1) + return false; + + return !lmv_is_known_hash_type(lsm->lsm_md_hash_type); +} + +/* NB, this is checking directory default LMV */ +static inline bool lmv_dir_space_hashed(const struct lmv_stripe_md *lsm) +{ + return lsm && lsm->lsm_md_hash_type == LMV_HASH_TYPE_SPACE; +} + static inline bool lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) { @@ -74,7 +115,7 @@ lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) lsm2->lsm_md_pool_name) != 0) return false; - if (lsm1->lsm_md_magic == LMV_MAGIC_V1) { + if (lmv_dir_striped(lsm1)) { for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) { if (!lu_fid_eq(&lsm1->lsm_md_oinfo[idx].lmo_fid, &lsm2->lsm_md_oinfo[idx].lmo_fid)) @@ -96,7 +137,7 @@ static inline void lsm_md_dump(int mask, const struct lmv_stripe_md *lsm) lsm->lsm_md_layout_version, lsm->lsm_md_migrate_offset, lsm->lsm_md_migrate_hash, lsm->lsm_md_pool_name); - if (lsm->lsm_md_magic != LMV_MAGIC_V1) + if (!lmv_dir_striped(lsm)) return; for (i = 0; i < lsm->lsm_md_stripe_count; i++) @@ -190,12 +231,6 @@ static inline int lmv_name_to_stripe_index(__u32 lmv_hash_type, return idx; } -static inline bool lmv_is_known_hash_type(__u32 type) -{ - return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 || - (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS; -} - static inline bool lmv_magic_supported(__u32 lum_magic) { return lum_magic == LMV_USER_MAGIC || diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index 914fff0..06f092e 100644 --- 
a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -163,8 +163,7 @@ void ll_release_page(struct inode *inode, struct page *page, /* Always remove the page for striped dir, because the page is * built from temporarily in LMV layer */ - if (inode != NULL && S_ISDIR(inode->i_mode) && - ll_i2info(inode)->lli_lsm_md != NULL) { + if (inode && ll_dir_striped(inode)) { __free_page(page); return; } @@ -341,7 +340,7 @@ static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) */ GOTO(out, rc = 0); - if (unlikely(ll_i2info(inode)->lli_lsm_md != NULL)) { + if (unlikely(ll_dir_striped(inode))) { /* * This is only needed for striped dir to fill .., * see lmv_read_page() diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 6f26724..d7c1e9e 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -4216,7 +4216,7 @@ int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum, if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) & OBD_CONNECT2_DIR_MIGRATE)) { if (le32_to_cpu(lum->lum_stripe_count) > 1 || - ll_i2info(child_inode)->lli_lsm_md) { + ll_dir_striped(child_inode)) { CERROR("%s: MDT doesn't support stripe directory " "migration!\n", ll_i2sbi(parent)->ll_fsname); GOTO(out_iput, rc = -EOPNOTSUPP); @@ -4403,8 +4403,7 @@ static int ll_inode_revalidate_fini(struct inode *inode, int rc) /* If it is striped directory, and there is bad stripe * Let's revalidate the dentry again, instead of returning * error */ - if (S_ISDIR(inode->i_mode) && - ll_i2info(inode)->lli_lsm_md != NULL) + if (ll_dir_striped(inode)) return 0; /* This path cannot be hit for regular files unless in @@ -4481,8 +4480,7 @@ static int ll_merge_md_attr(struct inode *inode) LASSERT(lli->lli_lsm_md != NULL); - /* foreign dir is not striped dir */ - if (lli->lli_lsm_md->lsm_md_magic == LMV_MAGIC_FOREIGN) + if (!lmv_dir_striped(lli->lli_lsm_md)) RETURN(0); down_read(&lli->lli_lsm_sem); @@ -4550,8 +4548,7 @@ int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct 
kstat *stat) } } else { /* If object isn't regular a file then don't validate size. */ - if (S_ISDIR(inode->i_mode) && - lli->lli_lsm_md != NULL) { + if (ll_dir_striped(inode)) { rc = ll_merge_md_attr(inode); if (rc < 0) RETURN(rc); diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index e824afb..9ff4cc0 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -1165,6 +1165,13 @@ static inline struct lu_fid *ll_inode2fid(struct inode *inode) return fid; } +static inline bool ll_dir_striped(struct inode *inode) +{ + LASSERT(inode); + return S_ISDIR(inode->i_mode) && + lmv_dir_striped(ll_i2info(inode)->lli_lsm_md); +} + static inline loff_t ll_file_maxbytes(struct inode *inode) { struct cl_object *obj = ll_i2info(inode)->lli_clob; diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 8e6512b..cc84e32 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -1367,6 +1367,9 @@ static int ll_init_lsm_md(struct inode *inode, struct lustre_md *md) ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid)); lsm_md_dump(D_INODE, lsm); + if (!lmv_dir_striped(lsm)) + goto out; + /* XXX sigh, this lsm_root initialization should be in * LMV layer, but it needs ll_iget right now, so we * put this here right now. */ @@ -1394,7 +1397,7 @@ static int ll_init_lsm_md(struct inode *inode, struct lustre_md *md) return rc; } } - +out: lli->lli_lsm_md = lsm; return 0; @@ -1478,10 +1481,9 @@ static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) * * foreign LMV should not change. 
*/ - if (lli->lli_lsm_md && - lli->lli_lsm_md->lsm_md_magic != LMV_MAGIC_FOREIGN && - !lsm_md_eq(lli->lli_lsm_md, lsm)) { - if (lsm->lsm_md_layout_version <= + if (lli->lli_lsm_md && !lsm_md_eq(lli->lli_lsm_md, lsm)) { + if (lmv_dir_striped(lli->lli_lsm_md) && + lsm->lsm_md_layout_version <= lli->lli_lsm_md->lsm_md_layout_version) { CERROR("%s: "DFID" dir layout mismatch:\n", ll_i2sbi(inode)->ll_fsname, @@ -1501,15 +1503,6 @@ static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) if (!lli->lli_lsm_md) { struct cl_attr *attr; - if (lsm->lsm_md_magic == LMV_MAGIC_FOREIGN) { - /* set md->lmv to NULL, so the following free lustre_md - * will not free this lsm */ - md->lmv = NULL; - lli->lli_lsm_md = lsm; - up_write(&lli->lli_lsm_sem); - RETURN(0); - } - rc = ll_init_lsm_md(inode, md); up_write(&lli->lli_lsm_sem); if (rc != 0) @@ -1525,6 +1518,9 @@ static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) */ down_read(&lli->lli_lsm_sem); + if (!lmv_dir_striped(lli->lli_lsm_md)) + GOTO(unlock, rc); + OBD_ALLOC_PTR(attr); if (attr == NULL) GOTO(unlock, rc = -ENOMEM); diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index 27e7bff..4734210 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -230,6 +230,7 @@ int ll_dom_lock_cancel(struct inode *inode, struct ldlm_lock *lock) void ll_lock_cancel_bits(struct ldlm_lock *lock, __u64 to_cancel) { struct inode *inode = ll_inode_from_resource_lock(lock); + struct ll_inode_info *lli; __u64 bits = to_cancel; int rc; @@ -317,15 +318,12 @@ void ll_lock_cancel_bits(struct ldlm_lock *lock, __u64 to_cancel) PFID(ll_inode2fid(inode)), rc); } - if (bits & MDS_INODELOCK_UPDATE) { - struct ll_inode_info *lli = ll_i2info(inode); + lli = ll_i2info(inode); + if (bits & MDS_INODELOCK_UPDATE) lli->lli_update_atime = 1; - } if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) { - struct ll_inode_info *lli = ll_i2info(inode); - CDEBUG(D_INODE, "invalidating inode "DFID" lli = %p, " "pfid = 
"DFID"\n", PFID(ll_inode2fid(inode)), lli, PFID(&lli->lli_pfid)); @@ -702,7 +700,7 @@ static int ll_lookup_it_finish(struct ptlrpc_request *request, struct lu_fid fid = ll_i2info(parent)->lli_fid; /* If it is striped directory, get the real stripe parent */ - if (unlikely(ll_i2info(parent)->lli_lsm_md != NULL)) { + if (unlikely(ll_dir_striped(parent))) { rc = md_get_fid_from_lsm(ll_i2mdexp(parent), ll_i2info(parent)->lli_lsm_md, (*de)->d_name.name, diff --git a/lustre/lmv/lmv_intent.c b/lustre/lmv/lmv_intent.c index 75be491..2ee7aa9 100644 --- a/lustre/lmv/lmv_intent.c +++ b/lustre/lmv/lmv_intent.c @@ -292,16 +292,15 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, ENTRY; /* do not allow file creation in foreign dir */ - if ((it->it_op & IT_CREAT) && op_data->op_mea1 != NULL && - op_data->op_mea1->lsm_md_magic == LMV_MAGIC_FOREIGN) + if ((it->it_op & IT_CREAT) && lmv_dir_foreign(op_data->op_mea1)) RETURN(-ENODATA); if ((it->it_op & IT_CREAT) && !(flags & MDS_OPEN_BY_FID)) { /* don't allow create under dir with bad hash */ - if (lmv_is_dir_bad_hash(op_data->op_mea1)) + if (lmv_dir_bad_hash(op_data->op_mea1)) RETURN(-EBADF); - if (lmv_is_dir_migrating(op_data->op_mea1)) { + if (lmv_dir_migrating(op_data->op_mea1)) { if (flags & O_EXCL) { /* * open(O_CREAT | O_EXCL) needs to check @@ -310,8 +309,7 @@ static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data, * file under old layout, check old layout on * client side. */ - tgt = lmv_locate_tgt(lmv, op_data, - &op_data->op_fid1); + tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -345,7 +343,7 @@ retry: /* for striped directory, we can't know parent stripe fid * without name, but we can set it to child fid, and MDT * will obtain it from linkea in open in such case. 
*/ - if (op_data->op_mea1 != NULL) + if (lmv_dir_striped(op_data->op_mea1)) op_data->op_fid1 = op_data->op_fid2; tgt = lmv_find_target(lmv, &op_data->op_fid2); @@ -358,7 +356,7 @@ retry: LASSERT(fid_is_zero(&op_data->op_fid2)); LASSERT(op_data->op_name != NULL); - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); } @@ -443,8 +441,7 @@ lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, ENTRY; /* foreign dir is not striped */ - if (op_data->op_mea1 && - op_data->op_mea1->lsm_md_magic == LMV_MAGIC_FOREIGN) { + if (lmv_dir_foreign(op_data->op_mea1)) { /* only allow getattr/lookup for itself */ if (op_data->op_name != NULL) RETURN(-ENODATA); @@ -452,7 +449,7 @@ lmv_intent_lookup(struct obd_export *exp, struct md_op_data *op_data, } retry: - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -475,7 +472,7 @@ retry: if (*reqp == NULL) { /* If RPC happens, lsm information will be revalidated * during update_inode process (see ll_update_lsm_md) */ - if (op_data->op_mea2 != NULL) { + if (lmv_dir_striped(op_data->op_mea2)) { rc = lmv_revalidate_slaves(exp, op_data->op_mea2, cb_blocking, extra_lock_flags); diff --git a/lustre/lmv/lmv_internal.h b/lustre/lmv/lmv_internal.h index 44debf0..7017e94 100644 --- a/lustre/lmv/lmv_internal.h +++ b/lustre/lmv/lmv_internal.h @@ -136,6 +136,8 @@ lsm_name_to_stripe_info(const struct lmv_stripe_md *lsm, const char *name, __u32 stripe_count = lsm->lsm_md_stripe_count; int stripe_index; + LASSERT(lmv_dir_striped(lsm)); + if (hash_type & LMV_HASH_FLAG_MIGRATION) { if (post_migrate) { hash_type &= ~LMV_HASH_FLAG_MIGRATION; @@ -166,26 +168,6 @@ lsm_name_to_stripe_info(const struct lmv_stripe_md *lsm, const char *name, return &lsm->lsm_md_oinfo[stripe_index]; } -static inline bool lmv_is_dir_migrating(const struct lmv_stripe_md *lsm) -{ - return lsm ? 
lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION : false; -} - -static inline bool lmv_is_dir_bad_hash(const struct lmv_stripe_md *lsm) -{ - if (!lsm) - return false; - - if (lmv_is_dir_migrating(lsm)) { - if (lsm->lsm_md_stripe_count - lsm->lsm_md_migrate_offset > 1) - return !lmv_is_known_hash_type( - lsm->lsm_md_migrate_hash); - return false; - } - - return !lmv_is_known_hash_type(lsm->lsm_md_hash_type); -} - static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data) { const struct lmv_stripe_md *lsm = op_data->op_mea1; @@ -193,12 +175,12 @@ static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data) if (!lsm) return false; - if (lmv_is_dir_migrating(lsm) && !op_data->op_post_migrate) { + if (lmv_dir_migrating(lsm) && !op_data->op_post_migrate) { op_data->op_post_migrate = true; return true; } - if (lmv_is_dir_bad_hash(lsm) && + if (lmv_dir_bad_hash(lsm) && op_data->op_stripe_index < lsm->lsm_md_stripe_count - 1) { op_data->op_stripe_index++; return true; @@ -208,8 +190,8 @@ static inline bool lmv_dir_retry_check_update(struct md_op_data *op_data) } struct lmv_tgt_desc *lmv_locate_tgt(struct lmv_obd *lmv, - struct md_op_data *op_data, - struct lu_fid *fid); + struct md_op_data *op_data); + /* lproc_lmv.c */ int lmv_tunables_init(struct obd_device *obd); diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c index da0a7102..440bceb 100644 --- a/lustre/lmv/lmv_obd.c +++ b/lustre/lmv/lmv_obd.c @@ -1158,26 +1158,26 @@ hsm_req_err: /** * This is _inode_ placement policy function (not name). 
*/ -static int lmv_placement_policy(struct obd_device *obd, - struct md_op_data *op_data, u32 *mds) +static u32 lmv_placement_policy(struct obd_device *obd, + struct md_op_data *op_data) { - struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_obd *lmv = &obd->u.lmv; struct lmv_user_md *lum; + u32 mdt; ENTRY; - LASSERT(mds != NULL); - - if (lmv->desc.ld_tgt_count == 1) { - *mds = 0; + if (lmv->desc.ld_tgt_count == 1) RETURN(0); - } lum = op_data->op_data; - /* Choose MDS by + /* + * Choose MDT by * 1. See if the stripe offset is specified by lum. - * 2. Then check if there is default stripe offset. - * 3. Finally choose MDS by name hash if the parent + * 2. If parent has default LMV, and its hash type is "space", choose + * MDT with QoS. (see lmv_locate_tgt_qos()). + * 3. Then check if default LMV stripe offset is not -1. + * 4. Finally choose MDS by name hash if the parent * is striped directory. (see lmv_locate_tgt()). * * presently explicit MDT location is not supported @@ -1188,18 +1188,22 @@ static int lmv_placement_policy(struct obd_device *obd, if (op_data->op_cli_flags & CLI_SET_MEA && lum != NULL && le32_to_cpu(lum->lum_magic != LMV_MAGIC_FOREIGN) && le32_to_cpu(lum->lum_stripe_offset) != (__u32)-1) { - *mds = le32_to_cpu(lum->lum_stripe_offset); + mdt = le32_to_cpu(lum->lum_stripe_offset); + } else if (op_data->op_code == LUSTRE_OPC_MKDIR && + !lmv_dir_striped(op_data->op_mea1) && + lmv_dir_space_hashed(op_data->op_default_mea1)) { + mdt = op_data->op_mds; } else if (op_data->op_code == LUSTRE_OPC_MKDIR && op_data->op_default_mea1 && op_data->op_default_mea1->lsm_md_master_mdt_index != - (__u32)-1) { - *mds = op_data->op_default_mea1->lsm_md_master_mdt_index; - op_data->op_mds = *mds; + (__u32)-1) { + mdt = op_data->op_default_mea1->lsm_md_master_mdt_index; + op_data->op_mds = mdt; } else { - *mds = op_data->op_mds; + mdt = op_data->op_mds; } - RETURN(0); + RETURN(mdt); } int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds) @@ -1230,38 
+1234,32 @@ int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds) rc = 0; } - EXIT; + EXIT; out: mutex_unlock(&tgt->ltd_fid_mutex); - return rc; + return rc; } int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, struct lu_fid *fid, struct md_op_data *op_data) { - struct obd_device *obd = class_exp2obd(exp); - struct lmv_obd *lmv = &obd->u.lmv; - u32 mds = 0; - int rc; - ENTRY; + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + u32 mds; + int rc; - LASSERT(op_data != NULL); - LASSERT(fid != NULL); + ENTRY; - rc = lmv_placement_policy(obd, op_data, &mds); - if (rc) { - CERROR("Can't get target for allocating fid, " - "rc %d\n", rc); - RETURN(rc); - } + LASSERT(op_data != NULL); + LASSERT(fid != NULL); - rc = __lmv_fid_alloc(lmv, fid, mds); - if (rc) { - CERROR("Can't alloc new fid, rc %d\n", rc); - RETURN(rc); - } + mds = lmv_placement_policy(obd, op_data); - RETURN(rc); + rc = __lmv_fid_alloc(lmv, fid, mds); + if (rc) + CERROR("Can't alloc new fid, rc %d\n", rc); + + RETURN(rc); } static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) @@ -1615,20 +1613,30 @@ static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } -struct lmv_tgt_desc* -__lmv_locate_tgt(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, - const char *name, int namelen, struct lu_fid *fid, u32 *mds, - bool post_migrate) +static struct lmv_tgt_desc *lmv_locate_tgt_qos(struct lmv_obd *lmv, __u32 *mdt) +{ + static unsigned int rr_index; + + /* locate MDT round-robin is the first step */ + *mdt = rr_index % lmv->tgts_size; + rr_index++; + + return lmv->tgts[*mdt]; +} + +static struct lmv_tgt_desc * +lmv_locate_tgt_by_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, + const char *name, int namelen, struct lu_fid *fid, + __u32 *mds, bool post_migrate) { struct lmv_tgt_desc *tgt; const struct lmv_oinfo *oinfo; - if (lsm == NULL || namelen == 0) { + if (!lmv_dir_striped(lsm) || !namelen) { tgt = 
lmv_find_target(lmv, fid); if (IS_ERR(tgt)) return tgt; - LASSERT(mds); *mds = tgt->ltd_idx; return tgt; } @@ -1644,48 +1652,41 @@ __lmv_locate_tgt(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, return ERR_CAST(oinfo); } - if (fid != NULL) - *fid = oinfo->lmo_fid; - if (mds != NULL) - *mds = oinfo->lmo_mds; - + *fid = oinfo->lmo_fid; + *mds = oinfo->lmo_mds; tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); - CDEBUG(D_INFO, "locate on mds %u "DFID"\n", oinfo->lmo_mds, - PFID(&oinfo->lmo_fid)); + CDEBUG(D_INODE, "locate MDT %u parent "DFID"\n", *mds, PFID(fid)); return tgt; } - /** - * Locate mdt by fid or name + * Locate MDT of op_data->op_fid1 * * For striped directory, it will locate the stripe by name hash, if hash_type * is unknown, it will return the stripe specified by 'op_data->op_stripe_index' * which is set outside, and if dir is migrating, 'op_data->op_post_migrate' * indicates whether old or new layout is used to locate. * - * For normal direcotry, it will locate MDS by FID directly. + * For plain directory, normally it will locate MDT by FID, but if this + * directory has default LMV, and its hash type is "space", locate MDT with QoS. * * \param[in] lmv LMV device * \param[in] op_data client MD stack parameters, name, namelen * mds_num etc. - * \param[in] fid object FID used to locate MDS. * * retval pointer to the lmv_tgt_desc if succeed. * ERR_PTR(errno) if failed.
*/ -struct lmv_tgt_desc* -lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data, - struct lu_fid *fid) +struct lmv_tgt_desc * +lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data) { struct lmv_stripe_md *lsm = op_data->op_mea1; struct lmv_oinfo *oinfo; struct lmv_tgt_desc *tgt; - /* foreign dir is not striped dir */ - if (lsm && lsm->lsm_md_magic == LMV_MAGIC_FOREIGN) + if (lmv_dir_foreign(lsm)) return ERR_PTR(-ENODATA); /* During creating VOLATILE file, it should honor the mdt @@ -1697,62 +1698,122 @@ lmv_locate_tgt(struct lmv_obd *lmv, struct md_op_data *op_data, if (IS_ERR(tgt)) return tgt; - if (lsm) { + if (lmv_dir_striped(lsm)) { int i; /* refill the right parent fid */ for (i = 0; i < lsm->lsm_md_stripe_count; i++) { oinfo = &lsm->lsm_md_oinfo[i]; if (oinfo->lmo_mds == op_data->op_mds) { - *fid = oinfo->lmo_fid; + op_data->op_fid1 = oinfo->lmo_fid; break; } } if (i == lsm->lsm_md_stripe_count) - *fid = lsm->lsm_md_oinfo[0].lmo_fid; + op_data->op_fid1 = lsm->lsm_md_oinfo[0].lmo_fid; } - } else if (lmv_is_dir_bad_hash(lsm)) { + } else if (lmv_dir_bad_hash(lsm)) { LASSERT(op_data->op_stripe_index < lsm->lsm_md_stripe_count); oinfo = &lsm->lsm_md_oinfo[op_data->op_stripe_index]; - *fid = oinfo->lmo_fid; + op_data->op_fid1 = oinfo->lmo_fid; op_data->op_mds = oinfo->lmo_mds; tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); + } else if (op_data->op_code == LUSTRE_OPC_MKDIR && + lmv_dir_space_hashed(op_data->op_default_mea1) && + !lmv_dir_striped(lsm)) { + tgt = lmv_locate_tgt_qos(lmv, &op_data->op_mds); + /* + * only update statfs when mkdir under dir with "space" hash, + * this means the cached statfs may be stale, and current mkdir + * may not follow QoS accurately, but it's not serious, and it + * avoids periodic statfs when client doesn't mkdir under + * "space" hashed directories. 
+ */ + if (!IS_ERR(tgt)) { + struct obd_device *obd; + + obd = container_of(lmv, struct obd_device, u.lmv); + lmv_statfs_check_update(obd, tgt); + } } else { - tgt = __lmv_locate_tgt(lmv, lsm, op_data->op_name, - op_data->op_namelen, fid, - &op_data->op_mds, - op_data->op_post_migrate); + tgt = lmv_locate_tgt_by_name(lmv, op_data->op_mea1, + op_data->op_name, op_data->op_namelen, + &op_data->op_fid1, &op_data->op_mds, + op_data->op_post_migrate); } return tgt; } +/* Locate MDT of op_data->op_fid2 for link/rename */ +static struct lmv_tgt_desc * +lmv_locate_tgt2(struct lmv_obd *lmv, struct md_op_data *op_data) +{ + struct lmv_tgt_desc *tgt; + int rc; + + LASSERT(op_data->op_name); + if (lmv_dir_migrating(op_data->op_mea2)) { + struct lu_fid fid1 = op_data->op_fid1; + struct lmv_stripe_md *lsm1 = op_data->op_mea1; + struct ptlrpc_request *request = NULL; + + /* + * avoid creating new file under old layout of migrating + * directory, check it here. + */ + tgt = lmv_locate_tgt_by_name(lmv, op_data->op_mea2, + op_data->op_name, op_data->op_namelen, + &op_data->op_fid2, &op_data->op_mds, false); + if (IS_ERR(tgt)) + RETURN(tgt); + + op_data->op_fid1 = op_data->op_fid2; + op_data->op_mea1 = op_data->op_mea2; + rc = md_getattr_name(tgt->ltd_exp, op_data, &request); + op_data->op_fid1 = fid1; + op_data->op_mea1 = lsm1; + if (!rc) { + ptlrpc_req_finished(request); + RETURN(ERR_PTR(-EEXIST)); + } + + if (rc != -ENOENT) + RETURN(ERR_PTR(rc)); + } + + return lmv_locate_tgt_by_name(lmv, op_data->op_mea2, op_data->op_name, + op_data->op_namelen, &op_data->op_fid2, + &op_data->op_mds, true); +} + int lmv_create(struct obd_export *exp, struct md_op_data *op_data, const void *data, size_t datalen, umode_t mode, uid_t uid, gid_t gid, cfs_cap_t cap_effective, __u64 rdev, struct ptlrpc_request **request) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int rc; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = 
&obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; if (!lmv->desc.ld_active_tgt_count) RETURN(-EIO); - if (lmv_is_dir_bad_hash(op_data->op_mea1)) + if (lmv_dir_bad_hash(op_data->op_mea1)) RETURN(-EBADF); - if (lmv_is_dir_migrating(op_data->op_mea1)) { + if (lmv_dir_migrating(op_data->op_mea1)) { /* * if parent is migrating, create() needs to lookup existing * name, to avoid creating new file under old layout of * migrating directory, check old layout here. */ - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1769,7 +1830,7 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, op_data->op_post_migrate = true; } - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1789,8 +1850,6 @@ int lmv_create(struct obd_export *exp, struct md_op_data *op_data, RETURN(PTR_ERR(tgt)); op_data->op_mds = tgt->ltd_idx; - } else { - CDEBUG(D_CONFIG, "Server doesn't support striped dirs\n"); } CDEBUG(D_INODE, "CREATE obj "DFID" -> mds #%x\n", @@ -1846,7 +1905,7 @@ lmv_getattr_name(struct obd_export *exp,struct md_op_data *op_data, ENTRY; retry: - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -1947,39 +2006,7 @@ static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); - if (lmv_is_dir_migrating(op_data->op_mea2)) { - struct lu_fid fid1 = op_data->op_fid1; - struct lmv_stripe_md *lsm1 = op_data->op_mea1; - - /* - * avoid creating new file under old layout of migrating - * directory, check it here. 
- */ - tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, op_data->op_name, - op_data->op_namelen, &op_data->op_fid2, - &op_data->op_mds, false); - tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - op_data->op_fid1 = op_data->op_fid2; - op_data->op_mea1 = op_data->op_mea2; - rc = md_getattr_name(tgt->ltd_exp, op_data, request); - op_data->op_fid1 = fid1; - op_data->op_mea1 = lsm1; - if (!rc) { - ptlrpc_req_finished(*request); - *request = NULL; - RETURN(-EEXIST); - } - - if (rc != -ENOENT) - RETURN(rc); - } - - tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, op_data->op_name, - op_data->op_namelen, &op_data->op_fid2, - &op_data->op_mds, true); + tgt = lmv_locate_tgt2(lmv, op_data); if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -2027,7 +2054,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, if (IS_ERR(parent_tgt)) RETURN(PTR_ERR(parent_tgt)); - if (lsm) { + if (lmv_dir_striped(lsm)) { __u32 hash_type = lsm->lsm_md_hash_type; __u32 stripe_count = lsm->lsm_md_stripe_count; @@ -2035,7 +2062,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, * old stripes are appended after new stripes for migrating * directory. */ - if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) { + if (lmv_dir_migrating(lsm)) { hash_type = lsm->lsm_md_migrate_hash; stripe_count -= lsm->lsm_md_migrate_offset; } @@ -2045,7 +2072,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, if (rc < 0) RETURN(rc); - if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) + if (lmv_dir_migrating(lsm)) rc += lsm->lsm_md_migrate_offset; /* save it in fid4 temporarily for early cancel */ @@ -2059,7 +2086,7 @@ static int lmv_migrate(struct obd_export *exp, struct md_op_data *op_data, * if parent is being migrated too, fill op_fid2 with target * stripe fid, otherwise the target stripe is not created yet. 
*/ - if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION) { + if (lmv_dir_migrating(lsm)) { hash_type = lsm->lsm_md_hash_type & ~LMV_HASH_FLAG_MIGRATION; stripe_count = lsm->lsm_md_migrate_offset; @@ -2188,44 +2215,10 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); op_data->op_cap = cfs_curproc_cap_pack(); - if (lmv_is_dir_migrating(op_data->op_mea2)) { - struct lu_fid fid1 = op_data->op_fid1; - struct lmv_stripe_md *lsm1 = op_data->op_mea1; + op_data->op_name = new; + op_data->op_namelen = newlen; - /* - * we avoid creating new file under old layout of migrating - * directory, if there is an existing file with new name under - * old layout, we can't unlink file in old layout and rename to - * new layout in one transaction, so return -EBUSY here.` - */ - tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, new, newlen, - &op_data->op_fid2, &op_data->op_mds, - false); - if (IS_ERR(tgt)) - RETURN(PTR_ERR(tgt)); - - op_data->op_fid1 = op_data->op_fid2; - op_data->op_mea1 = op_data->op_mea2; - op_data->op_name = new; - op_data->op_namelen = newlen; - rc = md_getattr_name(tgt->ltd_exp, op_data, request); - op_data->op_fid1 = fid1; - op_data->op_mea1 = lsm1; - op_data->op_name = NULL; - op_data->op_namelen = 0; - if (!rc) { - ptlrpc_req_finished(*request); - *request = NULL; - RETURN(-EBUSY); - } - - if (rc != -ENOENT) - RETURN(rc); - } - - /* rename to new layout for migrating directory */ - tp_tgt = __lmv_locate_tgt(lmv, op_data->op_mea2, new, newlen, - &op_data->op_fid2, &op_data->op_mds, true); + tp_tgt = lmv_locate_tgt2(lmv, op_data); if (IS_ERR(tp_tgt)) RETURN(PTR_ERR(tp_tgt)); @@ -2275,10 +2268,10 @@ static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, RETURN(rc); } + op_data->op_name = old; + op_data->op_namelen = oldlen; retry: - sp_tgt = __lmv_locate_tgt(lmv, op_data->op_mea1, old, oldlen, - &op_data->op_fid1, &op_data->op_mds, - op_data->op_post_migrate); 
+ sp_tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(sp_tgt)) RETURN(PTR_ERR(sp_tgt)); @@ -2748,18 +2741,17 @@ int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, struct md_callback *cb_op, __u64 offset, struct page **ppage) { - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct lmv_tgt_desc *tgt; - int rc; + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int rc; + ENTRY; - if (unlikely(lsm != NULL)) { - /* foreign dir is not striped dir */ - if (lsm->lsm_md_magic == LMV_MAGIC_FOREIGN) - return -ENODATA; + if (unlikely(lmv_dir_foreign(op_data->op_mea1))) + RETURN(-ENODATA); + if (unlikely(lmv_dir_striped(op_data->op_mea1))) { rc = lmv_striped_read_page(exp, op_data, cb_op, offset, ppage); RETURN(rc); } @@ -2814,7 +2806,7 @@ static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, op_data->op_cap = cfs_curproc_cap_pack(); retry: - parent_tgt = lmv_locate_tgt(lmv, op_data, &op_data->op_fid1); + parent_tgt = lmv_locate_tgt(lmv, op_data); if (IS_ERR(parent_tgt)) RETURN(PTR_ERR(parent_tgt)); @@ -3110,7 +3102,7 @@ static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, RETURN(0); } - if (lsm->lsm_md_magic == LMV_MAGIC) { + if (lmv_dir_striped(lsm)) { for (i = 0; i < lsm->lsm_md_stripe_count; i++) { if (lsm->lsm_md_oinfo[i].lmo_root) iput(lsm->lsm_md_oinfo[i].lmo_root); @@ -3402,7 +3394,8 @@ int lmv_get_fid_from_lsm(struct obd_export *exp, { const struct lmv_oinfo *oinfo; - LASSERT(lsm != NULL); + LASSERT(lmv_dir_striped(lsm)); + oinfo = lsm_name_to_stripe_info(lsm, name, namelen, false); if (IS_ERR(oinfo)) return PTR_ERR(oinfo); @@ -3473,8 +3466,7 @@ static int lmv_merge_attr(struct obd_export *exp, int rc; int i; - /* foreign dir is not striped dir */ - if (lsm->lsm_md_magic == LMV_MAGIC_FOREIGN) + if (!lmv_dir_striped(lsm)) return 0; rc = lmv_revalidate_slaves(exp, lsm, 
cb_blocking, 0); diff --git a/lustre/lod/lod_internal.h b/lustre/lod/lod_internal.h index 46fc4c0..a9f5ffe 100644 --- a/lustre/lod/lod_internal.h +++ b/lustre/lod/lod_internal.h @@ -327,8 +327,9 @@ struct lod_object { /* foreign directory */ ldo_dir_is_foreign; /* - * default striping is not cached, so this field is - * invalid after create, make sure it's used by + * This default LMV is parent default LMV, which will be + * used in child creation, and it's not cached, so this + * field is invalid after create, make sure it's used by * lod_dir_striping_create_internal() only. */ struct lod_default_striping *ldo_def_striping; diff --git a/lustre/lod/lod_lov.c b/lustre/lod/lod_lov.c index c139e49..c711e39 100644 --- a/lustre/lod/lod_lov.c +++ b/lustre/lod/lod_lov.c @@ -1526,7 +1526,7 @@ int lod_striping_load(const struct lu_env *env, struct lod_object *lo) } } - if (rc < (typeof(rc))sizeof(struct lmv_mds_md_v1)) { + if (rc < (int)sizeof(struct lmv_mds_md_v1)) { /* Let's set stripe_loaded to avoid further * stripe loading especially for non-stripe directory, * which can hurt performance. 
(See LU-9840) diff --git a/lustre/lod/lod_object.c b/lustre/lod/lod_object.c index 0304a05..cfa8d03 100644 --- a/lustre/lod/lod_object.c +++ b/lustre/lod/lod_object.c @@ -3672,8 +3672,9 @@ static int lod_xattr_set_default_lmv_on_dir(const struct lu_env *env, const char *name, int fl, struct thandle *th) { - struct lmv_user_md_v1 *lum; - int rc; + struct lmv_user_md_v1 *lum; + int rc; + ENTRY; LASSERT(buf != NULL && buf->lb_buf != NULL); @@ -4558,13 +4559,12 @@ static int lod_get_default_lov_striping(const struct lu_env *env, struct lov_user_md_v1 *v1 = NULL; struct lov_user_md_v3 *v3 = NULL; struct lov_comp_md_v1 *comp_v1 = NULL; - __u16 comp_cnt; - __u16 mirror_cnt; - bool composite; - int rc, i, j; - ENTRY; + __u16 comp_cnt; + __u16 mirror_cnt; + bool composite; + int rc, i, j; - lds->lds_def_striping_set = 0; + ENTRY; rc = lod_get_lov_ea(env, lo); if (rc < 0) @@ -4695,27 +4695,30 @@ static int lod_get_default_lmv_striping(const struct lu_env *env, struct lod_object *lo, struct lod_default_striping *lds) { - struct lod_thread_info *info = lod_env_info(env); - struct lmv_user_md_v1 *v1 = NULL; - int rc; - ENTRY; + struct lmv_user_md *lmu; + int rc; lds->lds_dir_def_striping_set = 0; + rc = lod_get_default_lmv_ea(env, lo); if (rc < 0) - RETURN(rc); + return rc; - if (rc < (typeof(rc))sizeof(struct lmv_user_md)) - RETURN(0); + if (rc >= (int)sizeof(*lmu)) { + struct lod_thread_info *info = lod_env_info(env); - v1 = info->lti_ea_store; + lmu = info->lti_ea_store; - lds->lds_dir_def_stripe_count = le32_to_cpu(v1->lum_stripe_count); - lds->lds_dir_def_stripe_offset = le32_to_cpu(v1->lum_stripe_offset); - lds->lds_dir_def_hash_type = le32_to_cpu(v1->lum_hash_type); - lds->lds_dir_def_striping_set = 1; + lds->lds_dir_def_stripe_count = + le32_to_cpu(lmu->lum_stripe_count); + lds->lds_dir_def_stripe_offset = + le32_to_cpu(lmu->lum_stripe_offset); + lds->lds_dir_def_hash_type = + le32_to_cpu(lmu->lum_hash_type); + lds->lds_dir_def_striping_set = 1; + } - RETURN(0); + 
return 0; } /** @@ -5238,6 +5241,36 @@ out: RETURN(rc); } +static inline int dt_object_space_hashed(const struct lu_env *env, + struct lu_device *dev, + struct dt_object *dt) +{ + struct lu_object *obj; + struct lod_object *lo; + struct lmv_user_md *lmu; + int rc = 0; + + obj = lu_object_find_slice(env, dev, lu_object_fid(&dt->do_lu), NULL); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + lo = lu2lod_obj(obj); + + rc = lod_get_default_lmv_ea(env, lo); + if (rc < 0) + return rc; + + if (rc >= (int)sizeof(*lmu)) { + struct lod_thread_info *info = lod_env_info(env); + + lmu = info->lti_ea_store; + rc = le32_to_cpu(lmu->lum_hash_type) == LMV_HASH_TYPE_SPACE; + } + dt_object_put(env, dt); + + return rc; +} + /** * Implementation of dt_object_operations::do_declare_create. * @@ -5302,10 +5335,18 @@ static int lod_declare_create(const struct lu_env *env, struct dt_object *dt, GOTO(out, rc = -EREMOTE); if (lo->ldo_dir_stripe_offset == -1) { - /* child and parent should be in the same MDT */ - if (hint->dah_parent != NULL && - dt_object_remote(hint->dah_parent)) - GOTO(out, rc = -EREMOTE); + /* + * child and parent should be in the same MDT, + * but if parent has plain layout, it's allowed. + */ + if (hint->dah_parent && + dt_object_remote(hint->dah_parent)) { + rc = dt_object_space_hashed(env, + lo->ldo_obj.do_lu.lo_dev, + hint->dah_parent); + if (rc <= 0) + GOTO(out, rc ? rc : -EREMOTE); + } } else if (lo->ldo_dir_stripe_offset != ss->ss_node_id) { struct lod_device *lod; diff --git a/lustre/mdd/mdd_dir.c b/lustre/mdd/mdd_dir.c index 551c370..df65484 100644 --- a/lustre/mdd/mdd_dir.c +++ b/lustre/mdd/mdd_dir.c @@ -4501,7 +4501,7 @@ static int mdd_migrate(const struct lu_env *env, struct md_object *md_pobj, if (rc) GOTO(out, rc); - mdd_object_make_hint(env, NULL, tobj, attr, spec, hint); + mdd_object_make_hint(env, tpobj, tobj, attr, spec, hint); handle = mdd_trans_create(env, mdd); if (IS_ERR(handle)) -- 1.8.3.1