From f238540c879dc668e18cf99cba62f117ccae64d6 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Fri, 10 Jun 2022 19:47:00 -0600 Subject: [PATCH] LU-15913 mdt: disable parallel rename for striped dirs Parallel rename should not be done within striped directories to avoid remote updates. These are like cross-directory renames. Add tunables for parallel directory rename in case of problems. These can be configured separately for files and directories. mdt.*.enable_parallel_rename_dir mdt.*.enable_parallel_rename_file Fixes: 90979ab390 ("LU-12125 mds: allow parallel directory rename") Fixes: d76cc65d5d ("LU-12125 mds: allow parallel regular file rename") Signed-off-by: Andreas Dilger Change-Id: I384976cd1c9f401169336ee7a479ba0e3dd9f4ee Reviewed-on: https://review.whamcloud.com/47593 Reviewed-by: Artem Blagodarenko Reviewed-by: Lai Siyao Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/include/lprocfs_status.h | 12 ------ lustre/mdt/mdt_handler.c | 11 +++-- lustre/mdt/mdt_internal.h | 52 +++++++++++++++-------- lustre/mdt/mdt_lproc.c | 92 +++++++++++++++++++++++++++++++++++++---- lustre/mdt/mdt_reint.c | 26 ++++++++---- 5 files changed, 143 insertions(+), 50 deletions(-) diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h index 85fab90..64c56e2 100644 --- a/lustre/include/lprocfs_status.h +++ b/lustre/include/lprocfs_status.h @@ -107,18 +107,6 @@ struct obd_hist_pcpu { bool oh_initialized; }; -enum { - RENAME_SAMEDIR_SIZE = 0, - RENAME_CROSSDIR_SRC_SIZE, - RENAME_CROSSDIR_TGT_SIZE, - RENAME_LAST, -}; - -struct rename_stats { - ktime_t rs_init; - struct obd_histogram rs_hist[RENAME_LAST]; -}; - /* An lprocfs counter can be configured using the enum bit masks below. * * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 8b62753..3c5ebd5 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -5995,16 +5995,19 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, INIT_LIST_HEAD(&m->mdt_squash.rsi_nosquash_nids); spin_lock_init(&m->mdt_squash.rsi_lock); spin_lock_init(&m->mdt_lock); - m->mdt_enable_remote_dir = 1; - m->mdt_enable_striped_dir = 1; + m->mdt_enable_chprojid_gid = 0; m->mdt_enable_dir_migration = 1; m->mdt_enable_dir_restripe = 0; m->mdt_enable_dir_auto_split = 0; + m->mdt_enable_parallel_rename_dir = 1; + m->mdt_enable_parallel_rename_file = 1; + m->mdt_enable_remote_dir = 1; m->mdt_enable_remote_dir_gid = 0; - m->mdt_enable_chprojid_gid = 0; m->mdt_enable_remote_rename = 1; - m->mdt_dir_restripe_nsonly = 1; m->mdt_enable_remote_subdir_mount = 1; + m->mdt_enable_striped_dir = 1; + m->mdt_dir_restripe_nsonly = 1; + m->mdt_rename_stats.rs_init = ktime_get(); atomic_set(&m->mdt_mds_mds_conns, 0); atomic_set(&m->mdt_async_commit_count, 0); diff --git a/lustre/mdt/mdt_internal.h b/lustre/mdt/mdt_internal.h index a5a9178..8715ec3 100644 --- a/lustre/mdt/mdt_internal.h +++ b/lustre/mdt/mdt_internal.h @@ -205,6 +205,18 @@ struct mdt_statfs_cache { __u64 msf_age; }; +enum mdt_rename_type { + RENAME_SAMEDIR_SIZE = 0, + RENAME_CROSSDIR_SRC_SIZE, + RENAME_CROSSDIR_TGT_SIZE, + RENAME_LAST +}; + +struct rename_stats { + ktime_t rs_init; + struct obd_histogram rs_hist[RENAME_LAST]; +}; + /* split directory automatically when sub file count exceeds 50k */ #define DIR_SPLIT_COUNT_DEFAULT 50000 @@ -262,35 +274,37 @@ struct mdt_device { mo_dom_read_open:1, mo_migrate_hsm_allowed:1, mo_enable_strict_som:1; - unsigned int mo_dom_lock; + unsigned int mo_dom_lock; } mdt_opts; - /* mdt state flags */ - unsigned long mdt_state; + /* mdt state flags */ + unsigned long mdt_state; - /* transaction callbacks */ - struct dt_txn_callback mdt_txn_cb; + /* transaction callbacks */ + struct dt_txn_callback mdt_txn_cb; - /* these values should be updated from lov if necessary. - * or should be placed somewhere else. */ - int mdt_max_mdsize; + /* these values should be updated from lov if necessary. + * or should be placed somewhere else. */ + int mdt_max_mdsize; int mdt_max_ea_size; /* preferred BRW size, decided by storage type and capability */ __u32 mdt_brw_size; - struct upcall_cache *mdt_identity_cache; + struct upcall_cache *mdt_identity_cache; - unsigned int mdt_capa_conf:1, + unsigned int mdt_capa_conf:1, /* Enable remote dir on non-MDT0 */ - mdt_enable_remote_dir:1, - mdt_enable_striped_dir:1, mdt_enable_dir_migration:1, mdt_enable_dir_restripe:1, mdt_enable_dir_auto_split:1, + mdt_enable_parallel_rename_dir:1, + mdt_enable_parallel_rename_file:1, + mdt_enable_remote_dir:1, mdt_enable_remote_rename:1, - mdt_skip_lfsck:1, + mdt_enable_striped_dir:1, mdt_readonly:1, + mdt_skip_lfsck:1, /* dir restripe migrate dirent only */ mdt_dir_restripe_nsonly:1, /* subdirectory mount of remote dir */ @@ -310,9 +324,9 @@ struct mdt_device { struct mdt_statfs_cache mdt_osfs; /* root squash */ - struct root_squash_info mdt_squash; + struct root_squash_info mdt_squash; - struct rename_stats mdt_rename_stats; + struct rename_stats mdt_rename_stats; struct lu_fid mdt_md_root_fid; /* connection to quota master */ @@ -1298,8 +1312,10 @@ enum mdt_stat_idx { LPROC_MDT_SETXATTR, LPROC_MDT_STATFS, LPROC_MDT_SYNC, - LPROC_MDT_SAMEDIR_RENAME, - LPROC_MDT_CROSSDIR_RENAME, + LPROC_MDT_RENAME_SAMEDIR, + LPROC_MDT_RENAME_PAR_FILE, + LPROC_MDT_RENAME_PAR_DIR, + LPROC_MDT_RENAME_CROSSDIR, LPROC_MDT_IO_READ, LPROC_MDT_IO_WRITE, LPROC_MDT_IO_READ_BYTES, @@ -1322,7 +1338,7 @@ void mdt_rename_counter_tally(struct mdt_thread_info *info, struct mdt_device *mdt, struct ptlrpc_request *req, struct mdt_object *src, struct mdt_object *tgt, - long count); + enum mdt_stat_idx msi, s64 count); static inline struct obd_device *mdt2obd_dev(const struct mdt_device *mdt) { diff --git a/lustre/mdt/mdt_lproc.c b/lustre/mdt/mdt_lproc.c index 78fb699..6dfc117 100644 --- a/lustre/mdt/mdt_lproc.c +++ b/lustre/mdt/mdt_lproc.c @@ -164,13 +164,15 @@ static int lproc_mdt_attach_rename_seqstat(struct mdt_device *mdt) void mdt_rename_counter_tally(struct mdt_thread_info *info, struct mdt_device *mdt, struct ptlrpc_request *req, - struct mdt_object *src, - struct mdt_object *tgt, long count) + struct mdt_object *src, struct mdt_object *tgt, + enum mdt_stat_idx msi, s64 ktime_delta) { struct md_attr *ma = &info->mti_attr; struct rename_stats *rstats = &mdt->mdt_rename_stats; int rc; + mdt_counter_incr(req, LPROC_MDT_RENAME, ktime_delta); + ma->ma_need = MA_INODE; ma->ma_valid = 0; rc = mo_attr_get(info->mti_env, mdt_object_child(src), ma); @@ -181,13 +183,15 @@ void mdt_rename_counter_tally(struct mdt_thread_info *info, } if (src == tgt) { - mdt_counter_incr(req, LPROC_MDT_SAMEDIR_RENAME, count); + mdt_counter_incr(req, LPROC_MDT_RENAME_SAMEDIR, ktime_delta); + if (msi) /* parallel rename type */ + mdt_counter_incr(req, msi, ktime_delta); lprocfs_oh_tally_log2(&rstats->rs_hist[RENAME_SAMEDIR_SIZE], (unsigned int)ma->ma_attr.la_size); return; } - mdt_counter_incr(req, LPROC_MDT_CROSSDIR_RENAME, count); + mdt_counter_incr(req, LPROC_MDT_RENAME_CROSSDIR, ktime_delta); lprocfs_oh_tally_log2(&rstats->rs_hist[RENAME_CROSSDIR_SRC_SIZE], (unsigned int)ma->ma_attr.la_size); @@ -708,6 +712,72 @@ static ssize_t enable_chprojid_gid_store(struct kobject *kobj, } LUSTRE_RW_ATTR(enable_chprojid_gid); +static ssize_t enable_parallel_rename_dir_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + mdt->mdt_enable_parallel_rename_dir); +} + +static ssize_t enable_parallel_rename_dir_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + mdt->mdt_enable_parallel_rename_dir = val; + + return count; +} +LUSTRE_RW_ATTR(enable_parallel_rename_dir); + +static ssize_t enable_parallel_rename_file_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + mdt->mdt_enable_parallel_rename_file); +} + +static ssize_t enable_parallel_rename_file_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + mdt->mdt_enable_parallel_rename_file = val; + + return count; +} +LUSTRE_RW_ATTR(enable_parallel_rename_file); + static ssize_t enable_striped_dir_show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -1451,14 +1521,16 @@ static struct attribute *mdt_attrs[] = { &lustre_attr_identity_upcall.attr, &lustre_attr_identity_flush.attr, &lustre_attr_evict_tgt_nids.attr, - &lustre_attr_enable_remote_dir.attr, - &lustre_attr_enable_remote_dir_gid.attr, &lustre_attr_enable_chprojid_gid.attr, - &lustre_attr_enable_striped_dir.attr, &lustre_attr_enable_dir_migration.attr, &lustre_attr_enable_dir_restripe.attr, &lustre_attr_enable_dir_auto_split.attr, + &lustre_attr_enable_parallel_rename_dir.attr, + &lustre_attr_enable_parallel_rename_file.attr, + &lustre_attr_enable_remote_dir.attr, + &lustre_attr_enable_remote_dir_gid.attr, &lustre_attr_enable_remote_rename.attr, + &lustre_attr_enable_striped_dir.attr, &lustre_attr_commit_on_sharing.attr, &lustre_attr_local_recovery.attr, &lustre_attr_async_commit_count.attr, @@ -1575,8 +1647,10 @@ static const char * const mdt_stats[] = { [LPROC_MDT_SETXATTR] = "setxattr", [LPROC_MDT_STATFS] = "statfs", [LPROC_MDT_SYNC] = "sync", - [LPROC_MDT_SAMEDIR_RENAME] = "samedir_rename", - [LPROC_MDT_CROSSDIR_RENAME] = "crossdir_rename", + [LPROC_MDT_RENAME_SAMEDIR] = "samedir_rename", + [LPROC_MDT_RENAME_PAR_FILE] = "parallel_rename_file", + [LPROC_MDT_RENAME_PAR_DIR] = "parallel_rename_dir", + [LPROC_MDT_RENAME_CROSSDIR] = "crossdir_rename", [LPROC_MDT_IO_READ_BYTES] = "read_bytes", [LPROC_MDT_IO_WRITE_BYTES] = "write_bytes", [LPROC_MDT_IO_READ] = "read", diff --git a/lustre/mdt/mdt_reint.c b/lustre/mdt/mdt_reint.c index 3f1eca6..a60f1d1 100644 --- a/lustre/mdt/mdt_reint.c +++ b/lustre/mdt/mdt_reint.c @@ -2694,6 +2694,7 @@ static int mdt_reint_rename(struct mdt_thread_info *info, bool reverse = false, discard = false; bool cos_incompat; ktime_t kstart = ktime_get(); + enum mdt_stat_idx msi = 0; int rc; ENTRY; @@ -2739,6 +2740,8 @@ static int mdt_reint_rename(struct mdt_thread_info *info, * get rename lock, which will cause deadlock. */ if (!req_is_replay(req)) { + bool remote = mdt_object_remote(msrcdir); + /* * Normally rename RPC is handled on the MDT with the target * directory (if target exists, it's on the MDT with the @@ -2747,16 +2750,21 @@ static int mdt_reint_rename(struct mdt_thread_info *info, * cause any issue), return -EXDEV early to avoid taking * rename_lock. */ - if (!mdt->mdt_enable_remote_rename && - mdt_object_remote(msrcdir)) + if (!mdt->mdt_enable_remote_rename && remote) GOTO(out_put_tgtdir, rc = -EXDEV); /* This might be further relaxed in the future for regular file * renames in different source and target parents. Start with * only same-directory renames for simplicity and because this * is by far the most the common use case. + * + * Striped directories should be considered "remote". */ - if (msrcdir != mtgtdir) { + if (msrcdir != mtgtdir || remote || + (S_ISDIR(ma->ma_attr.la_mode) && + !mdt->mdt_enable_parallel_rename_dir) || + (!S_ISDIR(ma->ma_attr.la_mode) && + !mdt->mdt_enable_parallel_rename_file)) { rc = mdt_rename_lock(info, &rename_lh); if (rc != 0) { CERROR("%s: cannot lock for rename: rc = %d\n", @@ -2764,7 +2772,13 @@ static int mdt_reint_rename(struct mdt_thread_info *info, GOTO(out_put_tgtdir, rc); } } else { - CDEBUG(D_INFO, "%s: samedir rename "DFID"/"DNAME"\n", + if (S_ISDIR(ma->ma_attr.la_mode)) + msi = LPROC_MDT_RENAME_PAR_DIR; + else + msi = LPROC_MDT_RENAME_PAR_FILE; + + CDEBUG(D_INFO, + "%s: samedir parallel rename "DFID"/"DNAME"\n", mdt_obd_name(mdt), PFID(rr->rr_fid1), PNAME(&rr->rr_name)); } @@ -3004,14 +3018,12 @@ relock: /* handle last link of tgt object */ if (rc == 0) { - mdt_counter_incr(req, LPROC_MDT_RENAME, - ktime_us_delta(ktime_get(), kstart)); if (mnew) { mdt_handle_last_unlink(info, mnew, ma); discard = mdt_dom_check_for_discard(info, mnew); } mdt_rename_counter_tally(info, info->mti_mdt, req, - msrcdir, mtgtdir, + msrcdir, mtgtdir, msi, ktime_us_delta(ktime_get(), kstart)); } -- 1.8.3.1