Whamcloud - gitweb
LU-17434 lmv: add exclude list for remote dir 80/53780/5
authorLai Siyao <lai.siyao@whamcloud.com>
Tue, 16 Jan 2024 19:18:30 +0000 (14:18 -0500)
committerOleg Drokin <green@whamcloud.com>
Wed, 13 Mar 2024 03:23:48 +0000 (03:23 +0000)
Apache Spark creates a _temporary subdirectory for staging files, and
it should be created on the same MDT as its parent directory. Add a
tunable lmv.*.qos_exclude_prefixes: if the prefix of a directory name
is in this list, lmv_create() puts the directory on its parent's MDT.

This prefix list follows the same rule as the shell PATH environment
variable: ':' is used as the separator between prefixes. For
convenience, a leading '+' or '-' can be used to add or remove prefixes.

Add sanity 413k.

Signed-off-by: Lai Siyao <lai.siyao@whamcloud.com>
Change-Id: I4c8a118f0630c19054934a87bee3599bdb1fe7bb
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53780
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Sebastien Buisson <sbuisson@ddn.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd.h
lustre/lmv/lmv_internal.h
lustre/lmv/lmv_obd.c
lustre/lmv/lproc_lmv.c
lustre/tests/sanity.sh

index 774b151..c16d7ef 100644 (file)
@@ -424,6 +424,12 @@ struct lov_obd {
 
 #define lmv_tgt_desc lu_tgt_desc
 
+struct qos_exclude_prefix {    /* one name prefix excluded from QoS mkdir */
+       struct list_head        qep_list;       /* on lmv_qos_exclude_list */
+       struct rhash_head       qep_hash;       /* in lmv_qos_exclude_hash */
+       char                    qep_name[NAME_MAX + 1]; /* hash key string */
+};
+
 struct lmv_obd {
        struct lu_client_fld    lmv_fld;
        spinlock_t              lmv_lock;
@@ -440,6 +446,8 @@ struct lmv_obd {
        void                    *lmv_cache;
 
        __u32                   lmv_qos_rr_index; /* next round-robin MDT idx */
+       struct rhashtable       lmv_qos_exclude_hash;
+       struct list_head        lmv_qos_exclude_list;
 };
 
 #define lmv_mdt_count  lmv_mdt_descs.ltd_lmv_desc.ld_tgt_count
index b6c1591..41b9b12 100644 (file)
@@ -213,6 +213,9 @@ struct lmv_tgt_desc *lmv_locate_tgt(struct lmv_obd *lmv,
                                    struct md_op_data *op_data);
 int lmv_old_layout_lookup(struct lmv_obd *lmv, struct md_op_data *op_data);
 
+extern const struct rhashtable_params qos_exclude_hash_params;
+void qos_exclude_prefix_free(void *vprefix, void *data);
+
 /* lproc_lmv.c */
 int lmv_tunables_init(struct obd_device *obd);
 #endif
index 0dc0812..8046f77 100644 (file)
@@ -1201,11 +1201,46 @@ int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp,
        RETURN(rc);
 }
 
+/* rhashtable hash callback: hash the NUL-terminated prefix string that @data
+ * points to, salted with @seed. @len is not used because keys are strings.
+ */
+static u32 qos_exclude_hashfh(const void *data, u32 len, u32 seed)
+{
+       void *salt = (void *)(unsigned long)seed;
+       const char *prefix_name = data;
+
+       return hashlen_hash(cfs_hashlen_string(salt, prefix_name));
+}
+
+/* rhashtable compare callback: returns 0 when the lookup key string in @arg
+ * equals the name stored in the entry @obj, non-zero otherwise.
+ */
+static int qos_exclude_cmpfn(struct rhashtable_compare_arg *arg,
+                            const void *obj)
+{
+       const struct qos_exclude_prefix *entry = obj;
+
+       return strcmp((const char *)arg->key, entry->qep_name);
+}
+
+/* rhashtable parameters for the QoS exclude prefix table: entries are
+ * struct qos_exclude_prefix, keyed by the string stored in qep_name.
+ */
+const struct rhashtable_params qos_exclude_hash_params = {
+       .head_offset    = offsetof(struct qos_exclude_prefix, qep_hash),
+       .key_offset     = offsetof(struct qos_exclude_prefix, qep_name),
+       /* keys are actually variable-length strings */
+       .key_len        = 1,
+       .hashfn         = qos_exclude_hashfh,
+       .obj_cmpfn      = qos_exclude_cmpfn,
+       .automatic_shrinking = true,
+};
+
+/* Release one exclude prefix: unlink @vprefix (a struct qos_exclude_prefix)
+ * from the list it is on and free it. @data is unused.
+ */
+void qos_exclude_prefix_free(void *vprefix, void *data)
+{
+       struct qos_exclude_prefix *entry;
+
+       entry = (struct qos_exclude_prefix *)vprefix;
+       list_del(&entry->qep_list);
+       kfree(entry);
+}
+
 static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
 {
        struct lmv_obd *lmv = &obd->u.lmv;
-       struct lmv_desc *desc;
+       struct lmv_desc *desc;
        struct lnet_processid lnet_id;
+       struct qos_exclude_prefix *prefix;
        int i = 0;
        int rc;
 
@@ -1233,6 +1268,7 @@ static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
        lmv->max_easize = 0;
 
        spin_lock_init(&lmv->lmv_lock);
+       INIT_LIST_HEAD(&lmv->lmv_qos_exclude_list);
 
        /*
         * initialize rr_index to lower 32bit of netid, so that client
@@ -1260,7 +1296,32 @@ static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
                CWARN("%s: error initialize target table: rc = %d\n",
                      obd->obd_name, rc);
 
-       RETURN(rc);
+       rc = rhashtable_init(&lmv->lmv_qos_exclude_hash,
+                            &qos_exclude_hash_params);
+       if (rc) {
+               CERROR("%s: qos exclude hash initalize failed: %d\n",
+                      obd->obd_name, rc);
+               RETURN(rc);
+       }
+
+       prefix = kmalloc(sizeof(*prefix), __GFP_ZERO);
+       if (!prefix)
+               GOTO(out, rc = -ENOMEM);
+       /* Apache Spark creates a _temporary directory for staging files */
+       strcpy(prefix->qep_name, "_temporary");
+       rc = rhashtable_insert_fast(&lmv->lmv_qos_exclude_hash,
+                                   &prefix->qep_hash, qos_exclude_hash_params);
+       if (rc) {
+               kfree(prefix);
+               GOTO(out, rc);
+       }
+
+       list_add_tail(&prefix->qep_list, &lmv->lmv_qos_exclude_list);
+       GOTO(out, rc);
+out:
+       if (rc)
+               rhashtable_destroy(&lmv->lmv_qos_exclude_hash);
+       return rc;
 }
 
 static int lmv_cleanup(struct obd_device *obd)
@@ -1271,6 +1332,8 @@ static int lmv_cleanup(struct obd_device *obd)
 
        ENTRY;
 
+       rhashtable_free_and_destroy(&lmv->lmv_qos_exclude_hash,
+                                   qos_exclude_prefix_free, NULL);
        fld_client_fini(&lmv->lmv_fld);
        fld_client_debugfs_fini(&lmv->lmv_fld);
 
@@ -2046,6 +2109,37 @@ static struct lu_tgt_desc *lmv_locate_tgt_by_space(struct lmv_obd *lmv,
        return tgt;
 }
 
+/* Check whether the name being created matches a QoS exclude prefix.
+ *
+ * The part of op_data->op_name before its last '.' (or the whole name if it
+ * contains no '.') is looked up in lmv->lmv_qos_exclude_hash.
+ *
+ * Only the first op_data->op_namelen bytes of the name are examined, and the
+ * lookup always uses a NUL-terminated local copy, so this does not depend on
+ * op_name being NUL-terminated at op_namelen.
+ *
+ * \retval true  the name matches an exclude prefix
+ * \retval false no match, the name is encrypted, empty, longer than
+ *               NAME_MAX, or nothing precedes its last '.'
+ */
+static bool lmv_qos_exclude(struct lmv_obd *lmv, struct md_op_data *op_data)
+{
+       const char *name = op_data->op_name;
+       size_t namelen = op_data->op_namelen;
+       char buf[NAME_MAX + 1];
+       struct qos_exclude_prefix *prefix;
+       size_t i;
+
+       /* skip encrypted files */
+       if (op_data->op_file_encctx)
+               return false;
+
+       /* name length may not be validated yet */
+       if (!name || namelen > NAME_MAX)
+               return false;
+
+       /* cut the name at its last '.', scanning only within namelen */
+       for (i = namelen; i > 0; i--) {
+               if (name[i - 1] == '.') {
+                       namelen = i - 1;
+                       break;
+               }
+       }
+       /* an empty key can never be in the table */
+       if (!namelen)
+               return false;
+
+       memcpy(buf, name, namelen);
+       buf[namelen] = '\0';
+
+       prefix = rhashtable_lookup_fast(&lmv->lmv_qos_exclude_hash, buf,
+                                       qos_exclude_hash_params);
+       return prefix != NULL;
+}
+
 static int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
                      const void *data, size_t datalen, umode_t mode, uid_t uid,
                      gid_t gid, kernel_cap_t cap_effective, __u64 rdev,
@@ -2112,7 +2206,8 @@ static int lmv_create(struct obd_export *exp, struct md_op_data *op_data,
                        RETURN(-ENODEV);
                if (unlikely(tgt->ltd_statfs.os_state & OS_STATFS_NOCREATE))
                        GOTO(new_tgt, -EAGAIN);
-       } else if (lmv_op_default_qos_mkdir(op_data) ||
+       } else if ((lmv_op_default_qos_mkdir(op_data) &&
+                   !lmv_qos_exclude(lmv, op_data)) ||
                   tgt->ltd_statfs.os_state & OS_STATFS_NOCREATE) {
 new_tgt:
                tgt = lmv_locate_tgt_by_space(lmv, op_data, tgt);
index c2a5db2..4aa4583 100644 (file)
@@ -198,6 +198,151 @@ static ssize_t qos_threshold_rr_store(struct kobject *kobj,
 LUSTRE_RW_ATTR(qos_threshold_rr);
 
 #ifdef CONFIG_PROC_FS
+/* Print the QoS mkdir exclude prefixes, one per line.
+ *
+ * Directories with exclude prefixes will be created on the same MDT as their
+ * parent directory. The prefixes are set following the same rule as the shell
+ * PATH environment variable: ':' is used as separator for prefixes. And for
+ * convenience, '+/-' is used to add/remove prefixes.
+ *
+ * Returns 0 on success, or -ENOMEM if a larger output buffer can't be
+ * allocated.
+ */
+static int qos_exclude_prefixes_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *obd = m->private;
+       struct lmv_obd *lmv = &obd->u.lmv;
+       struct qos_exclude_prefix *prefix;
+
+restart:       /* re-entered with a larger m->buf after an overflow */
+       spin_lock(&lmv->lmv_lock);      /* the list is walked under lmv_lock */
+       list_for_each_entry(prefix, &lmv->lmv_qos_exclude_list, qep_list) {
+               seq_printf(m, "%s\n", prefix->qep_name);
+               if (seq_has_overflowed(m)) {
+                       spin_unlock(&lmv->lmv_lock);    /* unlock to realloc */
+                       kvfree(m->buf);
+                       m->count = 0;   /* discard the partial output */
+                       m->buf = kvmalloc(m->size <<= 1, GFP_KERNEL_ACCOUNT);
+                       if (!m->buf)
+                               return -ENOMEM;
+                       goto restart;   /* print the whole list again */
+               }
+       }
+       spin_unlock(&lmv->lmv_lock);
+
+       return 0;
+}
+
+/* Update the QoS mkdir exclude prefixes from a ':' separated list.
+ *
+ * A prefix may be preceded by '+' (add) or '-' (remove); the operator stays
+ * in effect for the following prefixes until another one is given. Without
+ * any operator the existing list is emptied once and then replaced by the
+ * prefixes written.
+ *
+ * The hash table and the list are always changed together under lmv_lock, so
+ * concurrent writers see them in sync. Entries taken out of the hash table
+ * are freed only after an RCU grace period, because lmv_qos_exclude() looks
+ * names up without holding lmv_lock.
+ *
+ * \retval count   all prefixes were processed
+ * \retval -ENOMEM out of memory
+ * \retval -EFAULT \a buffer can't be read
+ * \retval -ERANGE a prefix is longer than NAME_MAX
+ * \retval -ve     other failure inserting into the hash table; adding a
+ *                 prefix that already exists is not an error
+ */
+static ssize_t qos_exclude_prefixes_seq_write(struct file *file,
+                                             const char __user *buffer,
+                                             size_t count, loff_t *off)
+{
+       struct obd_device *obd;
+       struct lmv_obd *lmv;
+       struct qos_exclude_prefix *prefix;
+       struct qos_exclude_prefix *tmp;
+       LIST_HEAD(removed);
+       char namebuf[NAME_MAX + 1];
+       char *buf;
+       char *name;
+       char *p;
+       char op = 0;
+       bool pruned = false;
+       ssize_t rc = count;
+       int len;
+       int err;
+
+       /* one extra char to ensure buf ends with '\0' */
+       OBD_ALLOC(buf, count + 1);
+       if (!buf)
+               return -ENOMEM;
+       if (copy_from_user(buf, buffer, count)) {
+               rc = -EFAULT;
+               goto out;
+       }
+
+       obd = ((struct seq_file *)file->private_data)->private;
+       lmv = &obd->u.lmv;
+       p = buf;
+       while (p) {
+               while (*p == ':')
+                       p++;
+               if (*p == '\0')
+                       break;
+               if (*p == '+' || *p == '-')
+                       op = *p++;
+
+               name = p;
+               p = strchr(name, ':');
+               if (p)
+                       len = p - name;
+               else
+                       len = strlen(name);
+               if (!len)
+                       break;
+               if (len > NAME_MAX) {
+                       CERROR("%s: %s length exceeds NAME_MAX\n",
+                              obd->obd_name, name);
+                       rc = -ERANGE;
+                       goto out;
+               }
+
+               switch (op) {
+               default:
+                       /* no operator: replace the whole list, once */
+                       if (!pruned) {
+                               spin_lock(&lmv->lmv_lock);
+                               list_for_each_entry_safe(prefix, tmp,
+                                               &lmv->lmv_qos_exclude_list,
+                                               qep_list) {
+                                       rhashtable_remove_fast(
+                                               &lmv->lmv_qos_exclude_hash,
+                                               &prefix->qep_hash,
+                                               qos_exclude_hash_params);
+                                       list_move_tail(&prefix->qep_list,
+                                                      &removed);
+                               }
+                               spin_unlock(&lmv->lmv_lock);
+                               pruned = true;
+                       }
+                       fallthrough;
+               case '+':
+                       /* allocate before taking the spinlock */
+                       prefix = kzalloc(sizeof(*prefix), GFP_KERNEL);
+                       if (!prefix) {
+                               rc = -ENOMEM;
+                               goto out;
+                       }
+                       /* len <= NAME_MAX and the entry is zeroed, so the
+                        * name stays NUL-terminated
+                        */
+                       memcpy(prefix->qep_name, name, len);
+
+                       spin_lock(&lmv->lmv_lock);
+                       err = rhashtable_lookup_insert_fast(
+                                               &lmv->lmv_qos_exclude_hash,
+                                               &prefix->qep_hash,
+                                               qos_exclude_hash_params);
+                       if (!err)
+                               list_add_tail(&prefix->qep_list,
+                                             &lmv->lmv_qos_exclude_list);
+                       spin_unlock(&lmv->lmv_lock);
+
+                       if (err) {
+                               /* never visible to lookups, free directly */
+                               kfree(prefix);
+                               if (err != -EEXIST) {
+                                       rc = err;
+                                       goto out;
+                               }
+                       }
+                       break;
+               case '-':
+                       memcpy(namebuf, name, len);
+                       namebuf[len] = '\0';
+
+                       /* look up and unlink under the same lock, so two
+                        * writers can't both remove the same entry
+                        */
+                       spin_lock(&lmv->lmv_lock);
+                       prefix = rhashtable_lookup_fast(
+                                               &lmv->lmv_qos_exclude_hash,
+                                               namebuf,
+                                               qos_exclude_hash_params);
+                       if (prefix) {
+                               rhashtable_remove_fast(
+                                               &lmv->lmv_qos_exclude_hash,
+                                               &prefix->qep_hash,
+                                               qos_exclude_hash_params);
+                               list_move_tail(&prefix->qep_list, &removed);
+                       }
+                       spin_unlock(&lmv->lmv_lock);
+                       break;
+               }
+       }
+
+out:
+       if (!list_empty(&removed)) {
+               /* lockless lookups may still be comparing against the
+                * removed entries, wait for them before freeing
+                */
+               synchronize_rcu();
+               list_for_each_entry_safe(prefix, tmp, &removed, qep_list) {
+                       list_del(&prefix->qep_list);
+                       kfree(prefix);
+               }
+       }
+       OBD_FREE(buf, count + 1);
+       return rc;
+}
+LPROC_SEQ_FOPS(qos_exclude_prefixes);
+
 static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos)
 {
        struct obd_device *obd = p->private;
@@ -278,6 +423,14 @@ static const struct proc_ops lmv_proc_target_fops = {
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
 };
+
+/* per-device procfs files of LMV, installed through obd->obd_vars */
+struct lprocfs_vars lprocfs_lmv_obd_vars[] = {
+       {
+               .name   = "qos_exclude_prefixes",
+               .fops   = &qos_exclude_prefixes_fops,
+       },
+       {
+               .name   = "target_obd",
+               .fops   = &lmv_proc_target_fops,
+       },
+       { NULL }
+};
 #endif /* CONFIG_PROC_FS */
 
 static struct attribute *lmv_attrs[] = {
@@ -297,6 +450,9 @@ int lmv_tunables_init(struct obd_device *obd)
        int rc;
 
        obd->obd_ktype.default_groups = KOBJ_ATTR_GROUPS(lmv);
+#ifdef CONFIG_PROC_FS
+       obd->obd_vars = lprocfs_lmv_obd_vars;
+#endif
        rc = lprocfs_obd_setup(obd, true);
        if (rc)
                goto out_failed;
@@ -306,16 +462,6 @@ int lmv_tunables_init(struct obd_device *obd)
                lprocfs_obd_cleanup(obd);
                goto out_failed;
        }
-
-       rc = lprocfs_seq_create(obd->obd_proc_entry, "target_obd",
-                               0444, &lmv_proc_target_fops, obd);
-       if (rc) {
-               lprocfs_free_md_stats(obd);
-               lprocfs_obd_cleanup(obd);
-               CWARN("%s: error adding LMV target_obd file: rc = %d\n",
-                     obd->obd_name, rc);
-               rc = 0;
-       }
 #endif /* CONFIG_PROC_FS */
 out_failed:
        return rc;
index f838810..fed7896 100755 (executable)
@@ -29447,6 +29447,43 @@ test_413j()
 }
 run_test 413j "set default LMV by setxattr"
 
+test_413k() {
+       (( $MDS1_VERSION >= $(version_code 2.15.60) )) ||
+               skip "Need server version at least 2.15.60"
+
+       local index1
+       local index2
+       local old=$($LCTL get_param -n lmv.*.qos_exclude_prefixes)
+       local count=$($LCTL get_param -n lmv.*.qos_exclude_prefixes | wc -l)
+       local prefixes="abc:123:foo bar"
+
+       # add prefixes
+       stack_trap "$LCTL set_param lmv.*.qos_exclude_prefixes=\"$old\""
+       $LCTL set_param lmv.*.qos_exclude_prefixes="+$prefixes"
+
+       mkdir $DIR/$tdir || error "mkdir $tdir failed"
+       index1=$($LFS getstripe -m $DIR/$tdir)
+       for dname in _temporary _temporary.XXXXXX abc 123 "foo bar"; do
+               mkdir "$DIR/$tdir/$dname" || error "mkdir $dname failed"
+               index2=$($LFS getstripe -m "$DIR/$tdir/$dname")
+               ((index1 == index2)) ||
+                       error "$tdir on MDT$index1, $dname on MDT$index2"
+       done
+
+       # remove prefixes
+       $LCTL set_param lmv.*.qos_exclude_prefixes="-$prefixes"
+
+       # total prefixes length > PAGE_SIZE can be printed correctly
+       for c in {a..z}; do
+               prefixes=$(str_repeat $c 255)
+               $LCTL set_param lmv.*.qos_exclude_prefixes="+$prefixes" >/dev/null
+       done
+       local count2=$($LCTL get_param -n lmv.*.qos_exclude_prefixes | wc -l)
+       ((count2 == count + 26)) ||
+               error "prefixes count $count2 != $((count + 26))"
+}
+run_test 413k "QoS mkdir exclude prefixes"
+
 test_413z() {
        local pids=""
        local subdir