From e37ad8c8495ef31216d43ed82c0b947a719e2761 Mon Sep 17 00:00:00 2001
From: Andreas Dilger
Date: Fri, 10 Jun 2022 19:47:00 -0600
Subject: [PATCH] LU-15913 mdt: disable parallel rename for striped dirs

Parallel rename should not be done within striped directories
to avoid remote updates. These are like cross-directory renames.

Add tunables for parallel directory rename in case of problems.
These can be configured separately for files and directories.

    mdt.*.enable_parallel_rename_dir
    mdt.*.enable_parallel_rename_file

Lustre-change: https://review.whamcloud.com/47593
Lustre-commit: f238540c879dc668e18cf99cba62f117ccae64d6

Fixes: 90979ab390 ("LU-12125 mds: allow parallel directory rename")
Fixes: d76cc65d5d ("LU-12125 mds: allow parallel regular file rename")
Signed-off-by: Andreas Dilger
Change-Id: I384976cd1c9f401169336ee7a479ba0e3dd9f4ee
Reviewed-by: Artem Blagodarenko
Reviewed-by: Lai Siyao
Reviewed-on: https://review.whamcloud.com/48124
Tested-by: jenkins
Tested-by: Maloo
---
 lustre/include/lprocfs_status.h |  11 ----
 lustre/mdt/mdt_handler.c        |  13 ++--
 lustre/mdt/mdt_internal.h       |  52 ++++++++++------
 lustre/mdt/mdt_lproc.c          | 134 +++++++++++++++++++++++++++++++---------
 lustre/mdt/mdt_reint.c          |  26 +++++---
 5 files changed, 165 insertions(+), 71 deletions(-)

diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h index 0307933..70c4212 100644 --- a/lustre/include/lprocfs_status.h +++ b/lustre/include/lprocfs_status.h @@ -125,17 +125,6 @@ struct brw_stats { struct obd_histogram hist[BRW_LAST]; }; -enum { - RENAME_SAMEDIR_SIZE = 0, - RENAME_CROSSDIR_SRC_SIZE, - RENAME_CROSSDIR_TGT_SIZE, - RENAME_LAST, -}; - -struct rename_stats { - struct obd_histogram hist[RENAME_LAST]; -}; - /* An lprocfs counter can be configured using the enum bit masks below.
* * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 1b0fbf3..475f752 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -6003,17 +6003,20 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, INIT_LIST_HEAD(&m->mdt_squash.rsi_nosquash_nids); spin_lock_init(&m->mdt_squash.rsi_lock); spin_lock_init(&m->mdt_lock); - m->mdt_enable_remote_dir = 1; - m->mdt_enable_striped_dir = 1; + m->mdt_enable_chprojid_gid = 0; m->mdt_enable_dir_migration = 1; m->mdt_enable_dir_restripe = 0; m->mdt_enable_dir_auto_split = 0; - m->mdt_enable_remote_dir_gid = 0; - m->mdt_enable_chprojid_gid = 0; + m->mdt_enable_parallel_rename_dir = 1; + m->mdt_enable_parallel_rename_file = 1; m->mdt_enable_pin_gid = 0; + m->mdt_enable_remote_dir = 1; + m->mdt_enable_remote_dir_gid = 0; m->mdt_enable_remote_rename = 1; - m->mdt_dir_restripe_nsonly = 1; m->mdt_enable_remote_subdir_mount = 1; + m->mdt_enable_striped_dir = 1; + m->mdt_dir_restripe_nsonly = 1; + m->mdt_rename_stats.rs_init = ktime_get(); atomic_set(&m->mdt_mds_mds_conns, 0); atomic_set(&m->mdt_async_commit_count, 0); diff --git a/lustre/mdt/mdt_internal.h b/lustre/mdt/mdt_internal.h index 01bfc4c..b756131 100644 --- a/lustre/mdt/mdt_internal.h +++ b/lustre/mdt/mdt_internal.h @@ -207,6 +207,18 @@ struct mdt_statfs_cache { __u64 msf_age; }; +enum mdt_rename_type { + RENAME_SAMEDIR_SIZE = 0, + RENAME_CROSSDIR_SRC_SIZE, + RENAME_CROSSDIR_TGT_SIZE, + RENAME_LAST +}; + +struct rename_stats { + ktime_t rs_init; + struct obd_histogram rs_hist[RENAME_LAST]; +}; + /* split directory automatically when sub file count exceeds 50k */ #define DIR_SPLIT_COUNT_DEFAULT 50000 @@ -264,35 +276,37 @@ struct mdt_device { mo_dom_read_open:1, mo_migrate_hsm_allowed:1, mo_enable_strict_som:1; - unsigned int mo_dom_lock; + unsigned int mo_dom_lock; } mdt_opts; - /* mdt state flags */ - unsigned long mdt_state; + /* mdt state flags */ + 
unsigned long mdt_state; - /* transaction callbacks */ - struct dt_txn_callback mdt_txn_cb; + /* transaction callbacks */ + struct dt_txn_callback mdt_txn_cb; - /* these values should be updated from lov if necessary. - * or should be placed somewhere else. */ - int mdt_max_mdsize; + /* these values should be updated from lov if necessary. + * or should be placed somewhere else. */ + int mdt_max_mdsize; int mdt_max_ea_size; /* preferred BRW size, decided by storage type and capability */ __u32 mdt_brw_size; - struct upcall_cache *mdt_identity_cache; + struct upcall_cache *mdt_identity_cache; - unsigned int mdt_capa_conf:1, + unsigned int mdt_capa_conf:1, /* Enable remote dir on non-MDT0 */ - mdt_enable_remote_dir:1, - mdt_enable_striped_dir:1, mdt_enable_dir_migration:1, mdt_enable_dir_restripe:1, mdt_enable_dir_auto_split:1, + mdt_enable_parallel_rename_dir:1, + mdt_enable_parallel_rename_file:1, + mdt_enable_remote_dir:1, mdt_enable_remote_rename:1, - mdt_skip_lfsck:1, + mdt_enable_striped_dir:1, mdt_readonly:1, + mdt_skip_lfsck:1, /* dir restripe migrate dirent only */ mdt_dir_restripe_nsonly:1, /* subdirectory mount of remote dir */ @@ -314,9 +328,9 @@ struct mdt_device { struct mdt_statfs_cache mdt_osfs; /* root squash */ - struct root_squash_info mdt_squash; + struct root_squash_info mdt_squash; - struct rename_stats mdt_rename_stats; + struct rename_stats mdt_rename_stats; struct lu_fid mdt_md_root_fid; /* connection to quota master */ @@ -1304,8 +1318,10 @@ enum mdt_stat_idx { LPROC_MDT_SETXATTR, LPROC_MDT_STATFS, LPROC_MDT_SYNC, - LPROC_MDT_SAMEDIR_RENAME, - LPROC_MDT_CROSSDIR_RENAME, + LPROC_MDT_RENAME_SAMEDIR, + LPROC_MDT_RENAME_PAR_FILE, + LPROC_MDT_RENAME_PAR_DIR, + LPROC_MDT_RENAME_CROSSDIR, LPROC_MDT_IO_READ, LPROC_MDT_IO_WRITE, LPROC_MDT_IO_READ_BYTES, @@ -1327,7 +1343,7 @@ void mdt_rename_counter_tally(struct mdt_thread_info *info, struct mdt_device *mdt, struct ptlrpc_request *req, struct mdt_object *src, struct mdt_object *tgt, - long count); + 
enum mdt_stat_idx msi, s64 count); static inline struct obd_device *mdt2obd_dev(const struct mdt_device *mdt) { diff --git a/lustre/mdt/mdt_lproc.c b/lustre/mdt/mdt_lproc.c index 98ede5f..a28610e 100644 --- a/lustre/mdt/mdt_lproc.c +++ b/lustre/mdt/mdt_lproc.c @@ -120,11 +120,11 @@ static void rename_stats_show(struct seq_file *seq, (s64)now.tv_sec, now.tv_nsec); display_rename_stats(seq, "same_dir", - &rename_stats->hist[RENAME_SAMEDIR_SIZE]); + &rename_stats->rs_hist[RENAME_SAMEDIR_SIZE]); display_rename_stats(seq, "crossdir_src", - &rename_stats->hist[RENAME_CROSSDIR_SRC_SIZE]); + &rename_stats->rs_hist[RENAME_CROSSDIR_SRC_SIZE]); display_rename_stats(seq, "crossdir_tgt", - &rename_stats->hist[RENAME_CROSSDIR_TGT_SIZE]); + &rename_stats->rs_hist[RENAME_CROSSDIR_TGT_SIZE]); } static int mdt_rename_stats_seq_show(struct seq_file *seq, void *v) @@ -145,7 +145,7 @@ mdt_rename_stats_seq_write(struct file *file, const char __user *buf, int i; for (i = 0; i < RENAME_LAST; i++) - lprocfs_oh_clear(&mdt->mdt_rename_stats.hist[i]); + lprocfs_oh_clear(&mdt->mdt_rename_stats.rs_hist[i]); return len; } @@ -156,7 +156,7 @@ static int lproc_mdt_attach_rename_seqstat(struct mdt_device *mdt) int i; for (i = 0; i < RENAME_LAST; i++) - spin_lock_init(&mdt->mdt_rename_stats.hist[i].oh_lock); + spin_lock_init(&mdt->mdt_rename_stats.rs_hist[i].oh_lock); return lprocfs_obd_seq_create(mdt2obd_dev(mdt), "rename_stats", 0644, &mdt_rename_stats_fops, mdt); @@ -165,43 +165,47 @@ static int lproc_mdt_attach_rename_seqstat(struct mdt_device *mdt) void mdt_rename_counter_tally(struct mdt_thread_info *info, struct mdt_device *mdt, struct ptlrpc_request *req, - struct mdt_object *src, - struct mdt_object *tgt, long count) + struct mdt_object *src, struct mdt_object *tgt, + enum mdt_stat_idx msi, s64 ktime_delta) { struct md_attr *ma = &info->mti_attr; struct rename_stats *rstats = &mdt->mdt_rename_stats; int rc; + mdt_counter_incr(req, LPROC_MDT_RENAME, ktime_delta); + ma->ma_need = MA_INODE; 
ma->ma_valid = 0; rc = mo_attr_get(info->mti_env, mdt_object_child(src), ma); - if (rc) { - CERROR("%s: "DFID" attr_get, rc = %d\n", + if (rc) { + CERROR("%s: "DFID" attr_get, rc = %d\n", mdt_obd_name(mdt), PFID(mdt_object_fid(src)), rc); return; } - if (src == tgt) { - mdt_counter_incr(req, LPROC_MDT_SAMEDIR_RENAME, count); - lprocfs_oh_tally_log2(&rstats->hist[RENAME_SAMEDIR_SIZE], - (unsigned int)ma->ma_attr.la_size); - return; - } + if (src == tgt) { + mdt_counter_incr(req, LPROC_MDT_RENAME_SAMEDIR, ktime_delta); + if (msi) /* parallel rename type */ + mdt_counter_incr(req, msi, ktime_delta); + lprocfs_oh_tally_log2(&rstats->rs_hist[RENAME_SAMEDIR_SIZE], + (unsigned int)ma->ma_attr.la_size); + return; + } - mdt_counter_incr(req, LPROC_MDT_CROSSDIR_RENAME, count); - lprocfs_oh_tally_log2(&rstats->hist[RENAME_CROSSDIR_SRC_SIZE], - (unsigned int)ma->ma_attr.la_size); + mdt_counter_incr(req, LPROC_MDT_RENAME_CROSSDIR, ktime_delta); + lprocfs_oh_tally_log2(&rstats->rs_hist[RENAME_CROSSDIR_SRC_SIZE], + (unsigned int)ma->ma_attr.la_size); - ma->ma_need = MA_INODE; - ma->ma_valid = 0; - rc = mo_attr_get(info->mti_env, mdt_object_child(tgt), ma); - if (rc) { - CERROR("%s: "DFID" attr_get, rc = %d\n", + ma->ma_need = MA_INODE; + ma->ma_valid = 0; + rc = mo_attr_get(info->mti_env, mdt_object_child(tgt), ma); + if (rc) { + CERROR("%s: "DFID" attr_get, rc = %d\n", mdt_obd_name(mdt), PFID(mdt_object_fid(tgt)), rc); return; } - lprocfs_oh_tally_log2(&rstats->hist[RENAME_CROSSDIR_TGT_SIZE], + lprocfs_oh_tally_log2(&rstats->rs_hist[RENAME_CROSSDIR_TGT_SIZE], (unsigned int)ma->ma_attr.la_size); } @@ -739,6 +743,72 @@ static ssize_t enable_pin_gid_store(struct kobject *kobj, } LUSTRE_RW_ATTR(enable_pin_gid); +static ssize_t enable_parallel_rename_dir_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + + return scnprintf(buf, 
PAGE_SIZE, "%u\n", + mdt->mdt_enable_parallel_rename_dir); +} + +static ssize_t enable_parallel_rename_dir_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + mdt->mdt_enable_parallel_rename_dir = val; + + return count; +} +LUSTRE_RW_ATTR(enable_parallel_rename_dir); + +static ssize_t enable_parallel_rename_file_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + mdt->mdt_enable_parallel_rename_file); +} + +static ssize_t enable_parallel_rename_file_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + mdt->mdt_enable_parallel_rename_file = val; + + return count; +} +LUSTRE_RW_ATTR(enable_parallel_rename_file); + static ssize_t enable_striped_dir_show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -1480,15 +1550,17 @@ static struct attribute *mdt_attrs[] = { &lustre_attr_identity_upcall.attr, &lustre_attr_identity_flush.attr, &lustre_attr_evict_tgt_nids.attr, - &lustre_attr_enable_remote_dir.attr, - &lustre_attr_enable_remote_dir_gid.attr, &lustre_attr_enable_chprojid_gid.attr, - &lustre_attr_enable_pin_gid.attr, - &lustre_attr_enable_striped_dir.attr, &lustre_attr_enable_dir_migration.attr, &lustre_attr_enable_dir_restripe.attr, &lustre_attr_enable_dir_auto_split.attr, + &lustre_attr_enable_parallel_rename_dir.attr, + 
&lustre_attr_enable_parallel_rename_file.attr, + &lustre_attr_enable_pin_gid.attr, + &lustre_attr_enable_remote_dir.attr, + &lustre_attr_enable_remote_dir_gid.attr, &lustre_attr_enable_remote_rename.attr, + &lustre_attr_enable_striped_dir.attr, &lustre_attr_commit_on_sharing.attr, &lustre_attr_local_recovery.attr, &lustre_attr_async_commit_count.attr, @@ -1605,8 +1677,10 @@ static const char * const mdt_stats[] = { [LPROC_MDT_SETXATTR] = "setxattr", [LPROC_MDT_STATFS] = "statfs", [LPROC_MDT_SYNC] = "sync", - [LPROC_MDT_SAMEDIR_RENAME] = "samedir_rename", - [LPROC_MDT_CROSSDIR_RENAME] = "crossdir_rename", + [LPROC_MDT_RENAME_SAMEDIR] = "samedir_rename", + [LPROC_MDT_RENAME_PAR_FILE] = "parallel_rename_file", + [LPROC_MDT_RENAME_PAR_DIR] = "parallel_rename_dir", + [LPROC_MDT_RENAME_CROSSDIR] = "crossdir_rename", [LPROC_MDT_IO_READ_BYTES] = "read_bytes", [LPROC_MDT_IO_WRITE_BYTES] = "write_bytes", [LPROC_MDT_IO_READ] = "read", diff --git a/lustre/mdt/mdt_reint.c b/lustre/mdt/mdt_reint.c index eec167a..ee4dd23 100644 --- a/lustre/mdt/mdt_reint.c +++ b/lustre/mdt/mdt_reint.c @@ -2691,6 +2691,7 @@ static int mdt_reint_rename(struct mdt_thread_info *info, bool reverse = false, discard = false; bool cos_incompat; ktime_t kstart = ktime_get(); + enum mdt_stat_idx msi = 0; int rc; ENTRY; @@ -2736,6 +2737,8 @@ static int mdt_reint_rename(struct mdt_thread_info *info, * get rename lock, which will cause deadlock. */ if (!req_is_replay(req)) { + bool remote = mdt_object_remote(msrcdir); + /* * Normally rename RPC is handled on the MDT with the target * directory (if target exists, it's on the MDT with the @@ -2744,16 +2747,21 @@ static int mdt_reint_rename(struct mdt_thread_info *info, * cause any issue), return -EXDEV early to avoid taking * rename_lock. 
*/ - if (!mdt->mdt_enable_remote_rename && - mdt_object_remote(msrcdir)) + if (!mdt->mdt_enable_remote_rename && remote) GOTO(out_put_tgtdir, rc = -EXDEV); /* This might be further relaxed in the future for regular file * renames in different source and target parents. Start with * only same-directory renames for simplicity and because this * is by far the most the common use case. + * + * Striped directories should be considered "remote". */ - if (msrcdir != mtgtdir) { + if (msrcdir != mtgtdir || remote || + (S_ISDIR(ma->ma_attr.la_mode) && + !mdt->mdt_enable_parallel_rename_dir) || + (!S_ISDIR(ma->ma_attr.la_mode) && + !mdt->mdt_enable_parallel_rename_file)) { rc = mdt_rename_lock(info, &rename_lh); if (rc != 0) { CERROR("%s: cannot lock for rename: rc = %d\n", @@ -2761,7 +2769,13 @@ static int mdt_reint_rename(struct mdt_thread_info *info, GOTO(out_put_tgtdir, rc); } } else { - CDEBUG(D_INFO, "%s: samedir rename "DFID"/"DNAME"\n", + if (S_ISDIR(ma->ma_attr.la_mode)) + msi = LPROC_MDT_RENAME_PAR_DIR; + else + msi = LPROC_MDT_RENAME_PAR_FILE; + + CDEBUG(D_INFO, + "%s: samedir parallel rename "DFID"/"DNAME"\n", mdt_obd_name(mdt), PFID(rr->rr_fid1), PNAME(&rr->rr_name)); } @@ -3001,14 +3015,12 @@ relock: /* handle last link of tgt object */ if (rc == 0) { - mdt_counter_incr(req, LPROC_MDT_RENAME, - ktime_us_delta(ktime_get(), kstart)); if (mnew) { mdt_handle_last_unlink(info, mnew, ma); discard = mdt_dom_check_for_discard(info, mnew); } mdt_rename_counter_tally(info, info->mti_mdt, req, - msrcdir, mtgtdir, + msrcdir, mtgtdir, msi, ktime_us_delta(ktime_get(), kstart)); } -- 1.8.3.1