From: Lai Siyao
Date: Tue, 16 Jan 2024 19:18:30 +0000 (-0500)
Subject: LU-17434 lmv: add exclude list for remote dir
X-Git-Tag: 2.15.62~117
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=5b07dce19b1830769d7a1f7bba8b559d3ead9dfb;p=fs%2Flustre-release.git

LU-17434 lmv: add exclude list for remote dir

Apache Spark creates a _temporary subdirectory for staging files, and
it should be created on the same MDT as its parent directory.

Add a tunable lmv.*.qos_exclude_prefixes, if directory prefix is in
this list, lmv_create() should put it on its parent MDT.

This prefix list follows the same rule as the shell environment PATH:
use ':' as separator for prefixes. And for convenience '+/-' can be
used to add/remove prefixes.

Add sanity 413k.

Signed-off-by: Lai Siyao
Change-Id: I4c8a118f0630c19054934a87bee3599bdb1fe7bb
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53780
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Andreas Dilger
Reviewed-by: Sebastien Buisson
Reviewed-by: Oleg Drokin
---

diff --git a/lustre/include/obd.h b/lustre/include/obd.h
index 774b151..c16d7ef 100644
--- a/lustre/include/obd.h
+++ b/lustre/include/obd.h
@@ -424,6 +424,14 @@ struct lov_obd {
 
 #define lmv_tgt_desc lu_tgt_desc
 
+/* one directory name prefix excluded from QoS MDT selection */
+struct qos_exclude_prefix {
+	struct list_head	qep_list;
+	struct rhash_head	qep_hash;
+	struct rcu_head		qep_rcu; /* defers free past RCU lookups */
+	char			qep_name[NAME_MAX + 1];
+};
+
 struct lmv_obd {
 	struct lu_client_fld	lmv_fld;
 	spinlock_t		lmv_lock;
@@ -440,6 +448,8 @@ struct lmv_obd {
 	void			*lmv_cache;
 
 	__u32			lmv_qos_rr_index; /* next round-robin MDT idx */
+	struct rhashtable	lmv_qos_exclude_hash;
+	struct list_head	lmv_qos_exclude_list;
 };
 
 #define lmv_mdt_count	lmv_mdt_descs.ltd_lmv_desc.ld_tgt_count
diff --git a/lustre/lmv/lmv_internal.h b/lustre/lmv/lmv_internal.h
index b6c1591..41b9b12 100644
--- a/lustre/lmv/lmv_internal.h
+++ b/lustre/lmv/lmv_internal.h
@@ -213,6 +213,9 @@ struct lmv_tgt_desc *lmv_locate_tgt(struct lmv_obd *lmv,
 				    struct md_op_data *op_data);
 int lmv_old_layout_lookup(struct lmv_obd *lmv, struct md_op_data *op_data);
 
+extern const struct rhashtable_params qos_exclude_hash_params;
+void qos_exclude_prefix_free(void *vprefix, void *data);
+
 /* lproc_lmv.c */
 int lmv_tunables_init(struct obd_device *obd);
 #endif
diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c
index 0dc0812..8046f77 100644
--- a/lustre/lmv/lmv_obd.c
+++ b/lustre/lmv/lmv_obd.c
@@ -1201,11 +1201,50 @@ int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp,
 	RETURN(rc);
 }
 
+/* rhashtable hashfn: keys are NUL-terminated names, so @len is not used */
+static u32 qos_exclude_hashfh(const void *data, u32 len, u32 seed)
+{
+	const char *name = data;
+
+	return hashlen_hash(cfs_hashlen_string((void *)(unsigned long)seed,
+					       name));
+}
+
+/* rhashtable obj_cmpfn: return 0 if the lookup key equals @obj's name */
+static int qos_exclude_cmpfn(struct rhashtable_compare_arg *arg,
+			     const void *obj)
+{
+	const struct qos_exclude_prefix *prefix = obj;
+	const char *name = arg->key;
+
+	return strcmp(name, prefix->qep_name);
+}
+
+/* shared by the lookups in lmv_obd.c and the tunable in lproc_lmv.c */
+const struct rhashtable_params qos_exclude_hash_params = {
+	.key_len	= 1, /* actually variable */
+	.key_offset	= offsetof(struct qos_exclude_prefix, qep_name),
+	.head_offset	= offsetof(struct qos_exclude_prefix, qep_hash),
+	.hashfn		= qos_exclude_hashfh,
+	.obj_cmpfn	= qos_exclude_cmpfn,
+	.automatic_shrinking = true,
+};
+
+/* rhashtable_free_and_destroy() callback: unlink and free one prefix */
+void qos_exclude_prefix_free(void *vprefix, void *data)
+{
+	struct qos_exclude_prefix *prefix = vprefix;
+
+	list_del(&prefix->qep_list);
+	kfree(prefix);
+}
+
 static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
 {
 	struct lmv_obd *lmv = &obd->u.lmv;
-	struct lmv_desc	*desc;
+	struct lmv_desc *desc;
 	struct lnet_processid lnet_id;
+	struct qos_exclude_prefix *prefix;
 	int i = 0;
 	int rc;
 
@@ -1233,6 +1272,7 @@ static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
 	lmv->max_easize = 0;
 
 	spin_lock_init(&lmv->lmv_lock);
+	INIT_LIST_HEAD(&lmv->lmv_qos_exclude_list);
 
 	/*
 	 * initialize rr_index to lower 32bit of netid, so that client
@@ -1260,7 +1300,36 @@ static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
 		CWARN("%s: error initialize target table: rc = %d\n",
 		      obd->obd_name, rc);
 
-	RETURN(rc);
+	/* return this error before rc is reused below */
+	if (rc)
+		RETURN(rc);
+
+	rc = rhashtable_init(&lmv->lmv_qos_exclude_hash,
+			     &qos_exclude_hash_params);
+	if (rc) {
+		CERROR("%s: qos exclude hash initalize failed: %d\n",
+		       obd->obd_name, rc);
+		RETURN(rc);
+	}
+
+	prefix = kzalloc(sizeof(*prefix), GFP_KERNEL);
+	if (!prefix)
+		GOTO(out, rc = -ENOMEM);
+	/* Apache Spark creates a _temporary directory for staging files */
+	strcpy(prefix->qep_name, "_temporary");
+	rc = rhashtable_insert_fast(&lmv->lmv_qos_exclude_hash,
+				    &prefix->qep_hash, qos_exclude_hash_params);
+	if (rc) {
+		kfree(prefix);
+		GOTO(out, rc);
+	}
+
+	list_add_tail(&prefix->qep_list, &lmv->lmv_qos_exclude_list);
+	GOTO(out, rc);
+out:
+	if (rc)
+		rhashtable_destroy(&lmv->lmv_qos_exclude_hash);
+	RETURN(rc);
 }
 
 static int lmv_cleanup(struct obd_device *obd)
@@ -1271,6 +1340,8 @@ static int lmv_cleanup(struct obd_device *obd)
 
 	ENTRY;
 
+	rhashtable_free_and_destroy(&lmv->lmv_qos_exclude_hash,
+				    qos_exclude_prefix_free, NULL);
 	fld_client_fini(&lmv->lmv_fld);
 	fld_client_debugfs_fini(&lmv->lmv_fld);
 
@@ -2046,6 +2117,42 @@ static struct lu_tgt_desc *lmv_locate_tgt_by_space(struct lmv_obd *lmv,
 	return tgt;
 }
 
+/*
+ * Return true if the name in @op_data, with anything after its last '.'
+ * stripped (e.g. "_temporary.XXXXXX" -> "_temporary"), is in the QoS
+ * exclude hash, in which case lmv_create() keeps it on the parent MDT.
+ */
+static bool lmv_qos_exclude(struct lmv_obd *lmv, struct md_op_data *op_data)
+{
+	const char *name = op_data->op_name;
+	size_t namelen = op_data->op_namelen;
+	char buf[NAME_MAX + 1];
+	struct qos_exclude_prefix *prefix;
+	char *p;
+
+	/* skip encrypted files */
+	if (op_data->op_file_encctx)
+		return false;
+
+	/* name length may not be validated yet */
+	if (namelen > NAME_MAX)
+		return false;
+
+	p = strrchr(name, '.');
+	if (p) {
+		namelen = p - name;
+		if (!namelen)
+			return false;
+		memcpy(buf, name, namelen);
+		buf[namelen] = '\0';
+		name = buf;
+	}
+
+	prefix = rhashtable_lookup_fast(&lmv->lmv_qos_exclude_hash, name,
+					qos_exclude_hash_params);
+	return prefix != NULL;
+}
+
 static int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
 			const void *data, size_t datalen, umode_t mode, uid_t uid,
 			gid_t gid, kernel_cap_t cap_effective, __u64 rdev,
@@ -2112,7 +2219,8 @@ static int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
 			RETURN(-ENODEV);
 		if (unlikely(tgt->ltd_statfs.os_state & OS_STATFS_NOCREATE))
 			GOTO(new_tgt, -EAGAIN);
-	} else if (lmv_op_default_qos_mkdir(op_data) ||
+	} else if ((lmv_op_default_qos_mkdir(op_data) &&
+		    !lmv_qos_exclude(lmv, op_data)) ||
 		   tgt->ltd_statfs.os_state & OS_STATFS_NOCREATE) {
 new_tgt:
 		tgt = lmv_locate_tgt_by_space(lmv, op_data, tgt);
diff --git a/lustre/lmv/lproc_lmv.c b/lustre/lmv/lproc_lmv.c
index c2a5db2..4aa4583 100644
--- a/lustre/lmv/lproc_lmv.c
+++ b/lustre/lmv/lproc_lmv.c
@@ -198,6 +198,164 @@ static ssize_t qos_threshold_rr_store(struct kobject *kobj,
 LUSTRE_RW_ATTR(qos_threshold_rr);
 
 #ifdef CONFIG_PROC_FS
+/* directories with excluded prefixes will be created on the same MDT as their
+ * parent directory. The prefixes are set with the same rule as the shell
+ * environment PATH: ':' is used as separator for prefixes. And for
+ * convenience, '+/-' is used to add/remove prefixes.
+ */
+static int qos_exclude_prefixes_seq_show(struct seq_file *m, void *v)
+{
+	struct obd_device *obd = m->private;
+	struct lmv_obd *lmv = &obd->u.lmv;
+	struct qos_exclude_prefix *prefix;
+
+restart:
+	spin_lock(&lmv->lmv_lock);
+	list_for_each_entry(prefix, &lmv->lmv_qos_exclude_list, qep_list) {
+		seq_printf(m, "%s\n", prefix->qep_name);
+		if (seq_has_overflowed(m)) {
+			spin_unlock(&lmv->lmv_lock);
+			kvfree(m->buf);
+			m->count = 0;
+			m->buf = kvmalloc(m->size <<= 1, GFP_KERNEL_ACCOUNT);
+			if (!m->buf)
+				return -ENOMEM;
+			goto restart;
+		}
+	}
+	spin_unlock(&lmv->lmv_lock);
+
+	return 0;
+}
+
+/*
+ * Parse a ':' separated prefix list. A '+' or '-' in front of a prefix adds
+ * or removes it and also applies to the prefixes that follow; if neither
+ * was given, the current list is replaced by the prefixes written.
+ */
+static ssize_t qos_exclude_prefixes_seq_write(struct file *file,
+					      const char __user *buffer,
+					      size_t count, loff_t *off)
+{
+	struct obd_device *obd;
+	struct lmv_obd *lmv;
+	char *buf;
+	char op = 0;
+	char *p;
+	char *name;
+	char namebuf[NAME_MAX + 1];
+	struct qos_exclude_prefix *prefix;
+	struct qos_exclude_prefix *tmp;
+	int len;
+	bool pruned = false;
+	int rc;
+
+	/* one extra char to ensure buf ends with '\0' */
+	OBD_ALLOC(buf, count + 1);
+	if (!buf)
+		return -ENOMEM;
+	if (copy_from_user(buf, buffer, count)) {
+		OBD_FREE(buf, count + 1);
+		return -EFAULT;
+	}
+
+	obd = ((struct seq_file *)file->private_data)->private;
+	lmv = &obd->u.lmv;
+	p = buf;
+	while (p) {
+		while (*p == ':')
+			p++;
+		if (*p == '\0')
+			break;
+		if (*p == '+' || *p == '-')
+			op = *p++;
+
+		name = p;
+		p = strchr(name, ':');
+		if (p)
+			len = p - name;
+		else
+			len = strlen(name);
+		if (!len)
+			break;
+		if (len > NAME_MAX) {
+			CERROR("%s: %s length exceeds NAME_MAX\n",
+			       obd->obd_name, name);
+			OBD_FREE(buf, count + 1);
+			return -ERANGE;
+		}
+
+		switch (op) {
+		default:
+			if (!pruned) {
+				spin_lock(&lmv->lmv_lock);
+				list_for_each_entry_safe(prefix, tmp,
+						&lmv->lmv_qos_exclude_list,
+						qep_list) {
+					list_del(&prefix->qep_list);
+					rhashtable_remove_fast(
+						&lmv->lmv_qos_exclude_hash,
+						&prefix->qep_hash,
+						qos_exclude_hash_params);
+					kfree_rcu(prefix, qep_rcu);
+				}
+				spin_unlock(&lmv->lmv_lock);
+				pruned = true;
+			}
+			fallthrough;
+		case '+':
+			prefix = kzalloc(sizeof(*prefix), GFP_KERNEL);
+			if (!prefix) {
+				OBD_FREE(buf, count + 1);
+				return -ENOMEM;
+			}
+			/* prefix is zeroed, so qep_name stays terminated */
+			memcpy(prefix->qep_name, name, len);
+			rc = rhashtable_lookup_insert_fast(
+						&lmv->lmv_qos_exclude_hash,
+						&prefix->qep_hash,
+						qos_exclude_hash_params);
+			if (!rc) {
+				spin_lock(&lmv->lmv_lock);
+				list_add_tail(&prefix->qep_list,
+					      &lmv->lmv_qos_exclude_list);
+				spin_unlock(&lmv->lmv_lock);
+			} else {
+				kfree(prefix);
+				/* a duplicate prefix is not an error */
+				if (rc != -EEXIST) {
+					OBD_FREE(buf, count + 1);
+					return rc;
+				}
+			}
+			break;
+		case '-':
+			memcpy(namebuf, name, len);
+			namebuf[len] = '\0';
+			/* the _fast variant takes the RCU read lock itself */
+			prefix = rhashtable_lookup_fast(
+						&lmv->lmv_qos_exclude_hash,
+						namebuf,
+						qos_exclude_hash_params);
+			if (prefix) {
+				spin_lock(&lmv->lmv_lock);
+				list_del(&prefix->qep_list);
+				spin_unlock(&lmv->lmv_lock);
+				rhashtable_remove_fast(
+					&lmv->lmv_qos_exclude_hash,
+					&prefix->qep_hash,
+					qos_exclude_hash_params);
+				kfree_rcu(prefix, qep_rcu);
+			}
+			break;
+		}
+	}
+
+	OBD_FREE(buf, count + 1);
+	return count;
+}
+LPROC_SEQ_FOPS(qos_exclude_prefixes);
+
 static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos)
 {
 	struct obd_device *obd = p->private;
@@ -278,6 +436,14 @@ static const struct proc_ops lmv_proc_target_fops = {
 	.proc_lseek	= seq_lseek,
 	.proc_release	= seq_release,
 };
+
+struct lprocfs_vars lprocfs_lmv_obd_vars[] = {
+	{ .name	= "qos_exclude_prefixes",
+	  .fops	= &qos_exclude_prefixes_fops },
+	{ .name	= "target_obd",
+	  .fops	= &lmv_proc_target_fops },
+	{ NULL }
+};
 #endif /* CONFIG_PROC_FS */
 
 static struct attribute *lmv_attrs[] = {
@@ -297,6 +463,9 @@ int lmv_tunables_init(struct obd_device *obd)
 	int rc;
 
 	obd->obd_ktype.default_groups = KOBJ_ATTR_GROUPS(lmv);
+#ifdef CONFIG_PROC_FS
+	obd->obd_vars = lprocfs_lmv_obd_vars;
+#endif
 	rc = lprocfs_obd_setup(obd, true);
 	if (rc)
 		goto out_failed;
@@ -306,16 +475,6 @@ int lmv_tunables_init(struct obd_device *obd)
 		lprocfs_obd_cleanup(obd);
 		goto out_failed;
 	}
-
-	rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd",
-				0444, &lmv_proc_target_fops, obd);
-	if (rc) {
-		lprocfs_free_md_stats(obd);
-		lprocfs_obd_cleanup(obd);
-		CWARN("%s: error adding LMV target_obd file: rc = %d\n",
-		      obd->obd_name, rc);
-		rc = 0;
-	}
 #endif /* CONFIG_PROC_FS */
 out_failed:
 	return rc;
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index f838810..fed7896 100755
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -29447,6 +29447,43 @@ test_413j()
 }
 run_test 413j "set default LMV by setxattr"
 
+test_413k() {
+	(( $MDS1_VERSION >= $(version_code 2.15.60) )) ||
+		skip "Need server version at least 2.15.60"
+
+	local index1
+	local index2
+	local old=$($LCTL get_param -n lmv.*.qos_exclude_prefixes)
+	local count=$($LCTL get_param -n lmv.*.qos_exclude_prefixes | wc -l)
+	local prefixes="abc:123:foo bar"
+
+	# add prefixes
+	stack_trap "$LCTL set_param lmv.*.qos_exclude_prefixes=\"$old\""
+	$LCTL set_param lmv.*.qos_exclude_prefixes="+$prefixes"
+
+	mkdir $DIR/$tdir || error "mkdir $tdir failed"
+	index1=$($LFS getstripe -m $DIR/$tdir)
+	for dname in _temporary _temporary.XXXXXX abc 123 "foo bar"; do
+		mkdir "$DIR/$tdir/$dname" || error "mkdir $dname failed"
+		index2=$($LFS getstripe -m "$DIR/$tdir/$dname")
+		((index1 == index2)) ||
+			error "$tdir on MDT$index1, $dname on MDT$index2"
+	done
+
+	# remove prefixes
+	$LCTL set_param lmv.*.qos_exclude_prefixes="-$prefixes"
+
+	# total prefixes length > PAGE_SIZE can be printed correctly
+	for c in {a..z}; do
+		prefixes=$(str_repeat $c 255)
+		$LCTL set_param lmv.*.qos_exclude_prefixes="+$prefixes" >/dev/null
+	done
+	local count2=$($LCTL get_param -n lmv.*.qos_exclude_prefixes | wc -l)
+	((count2 == count + 26)) ||
+		error "prefixes count $count2 != $((count + 26))"
+}
+run_test 413k "QoS mkdir exclude prefixes"
+
 test_413z() {
 	local pids=""
 	local subdir