Whamcloud - gitweb
EX-4015 lipe: add fid2path cache to lipe_scan3
authorJohn L. Hammond <jhammond@whamcloud.com>
Fri, 11 Feb 2022 17:05:15 +0000 (11:05 -0600)
committerJohn L. Hammond <jhammond@whamcloud.com>
Thu, 10 Mar 2022 17:24:25 +0000 (17:24 +0000)
Add a thread local directory fid2path cache to lipe_scan3. Without the
cache, a single scanning thread could expect to do about 3K fid2path
operations per second. After the cache the rate improves to about
70K. We set the max cache size to 1024 FIDs and use LRU to reclaim
slots. Based on this a full cache will use about 4MB of memory per
thread.

Test-Parameters: testlist=sanity-lipe-scan3 facet=mds1
Signed-off-by: John L. Hammond <jhammond@whamcloud.com>
Change-Id: I8a022665de78e6b599f2b4c4f1e2b7400d4d8ffe
Reviewed-on: https://review.whamcloud.com/46509
Tested-by: jenkins <devops@whamcloud.com>
lipe/src/lipe_scan3/Makefile.am
lipe/src/lipe_scan3/ls3_fid2path.c [new file with mode: 0644]
lipe/src/lipe_scan3/ls3_fid2path.h [new file with mode: 0644]
lipe/src/lipe_scan3/ls3_main.c
lipe/src/lipe_scan3/ls3_object_attrs.c
lipe/src/lipe_scan3/ls3_object_attrs.h
lipe/src/lipe_scan3/ls3_scan.c
lustre/tests/sanity-lipe-scan3.sh

index 8d76226..b4c3643 100644 (file)
@@ -16,6 +16,8 @@ lipe_scan3_SOURCES = \
        ../lipe_version.h \
        ../list.h \
        ls3_debug.h \
+       ls3_fid2path.c \
+       ls3_fid2path.h \
        ls3_main.c \
        ls3_object_attrs.c \
        ls3_object_attrs.h \
diff --git a/lipe/src/lipe_scan3/ls3_fid2path.c b/lipe/src/lipe_scan3/ls3_fid2path.c
new file mode 100644 (file)
index 0000000..9262cf7
--- /dev/null
@@ -0,0 +1,223 @@
+#include "ls3_fid2path.h"
+#include <assert.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <linux/lustre/lustre_fid.h>
+#include <linux/lustre/lustre_idl.h>
+#include <linux/lustre/lustre_ioctl.h>
+#include "list.h"
+#include "ls3_debug.h"
+
+/* Thread local fid2path cache. Created on demand on first call to
+ * ls3_fid2path(). XXX Must be destroyed by explicit call to
+ * ls3_fid2path_cache_fini(). We assume that it is only used for
+ * directories since we only cache one path per FID. This is used to
+ * speed up normal fid2path on MDTs by using the link xattr to get the
+ * parent FIDs and names and then using the cache to get the paths of
+ * the parent FIDs.
+ *
+ * XXX Not for caching paths of arbitrary files. It won't work
+ * properly and will defeat the benefits by polluting the cache.
+ *
+ * We assume that it is only used for a single FS. This is how we use
+ * it now.
+ *
+ * TODO Move this out of thread local storage and into scanning thread
+ * info.
+ */
+static __thread struct f2p_cache *ls3_fid2path_cache;
+
+/* Sizing of the per-thread cache. ls3_fid2path() passes these to
+ * f2p_cache_create(): the hash table gets
+ * 2^LS3_FID2PATH_CACHE_HASH_SHIFT buckets and the cache holds at most
+ * LS3_FID2PATH_CACHE_COUNT_MAX nodes before LRU reclaim kicks in. */
+enum {
+       LS3_FID2PATH_CACHE_HASH_SHIFT = 10,
+       LS3_FID2PATH_CACHE_COUNT_MAX = 1UL << LS3_FID2PATH_CACHE_HASH_SHIFT,
+};
+
+/* Given @ptr, a pointer to the field @member inside a @type, return a
+ * pointer to the enclosing @type. The temporary __mptr makes the
+ * compiler check that @ptr really has the member's type. Uses the GNU
+ * typeof and statement-expression extensions. */
+#define container_of(ptr, type, member) ({                      \
+       const typeof(((type *) 0)->member) * __mptr = (ptr);     \
+       (type *) ((char *) __mptr - offsetof(type, member)); })
+
+/* One cached fid2path result, successful or not. */
+struct f2p_node {
+       /* Linkage in f2p_cache::fc_lru_list (least recently used first). */
+       struct lipe_list_head fn_lru_link;
+       /* Linkage in one bucket of f2p_cache::fc_heads[]. */
+       struct hlist_node fn_node;
+       /* Lookup key. */
+       struct lu_fid fn_fid;
+       /* 0, or the negated errno of the failed OBD_IOC_FID2PATH. */
+       int fn_rc;
+       /* NUL-terminated path when fn_rc == 0, otherwise "". */
+       char fn_path[];
+};
+
+/* A FID => path cache: a hash table for lookup plus an LRU list for
+ * reclaim. Each node is linked into both. */
+struct f2p_cache {
+       /* All nodes; used nodes are moved to the tail, victims are
+        * taken from the head. */
+       struct lipe_list_head fc_lru_list;
+       /* Array of fc_hash_mask + 1 hash bucket heads. */
+       struct hlist_head *fc_heads;
+       /* Bucket count minus one; ANDed with the FID hash. */
+       size_t fc_hash_mask;
+       /* Number of nodes currently in the cache. */
+       size_t fc_node_count;
+       /* Node count above which f2p_cache_find() starts evicting. */
+       size_t fc_node_count_max;
+};
+
+/* Allocate a cache node for @fid that records either a success
+ * (@rc == 0 with a non-NULL @path, which is copied) or a failure
+ * (@rc != 0 with a NULL @path, leaving fn_path empty). The node is
+ * returned initialized but not linked into any cache. */
+static struct f2p_node *f2p_node_create(const struct lu_fid *fid, int rc, const char *path)
+{
+       size_t path_len = 0;
+       struct f2p_node *node;
+
+       assert((path != NULL) == (rc == 0));
+
+       if (path != NULL)
+               path_len = strlen(path);
+
+       /* Always reserve one byte past the path for the terminator.
+        * The NULL-path case relies on xcalloc() returning zeroed
+        * memory so that fn_path is the empty string. */
+       node = xcalloc(1, sizeof(*node) + path_len + 1);
+       node->fn_rc = rc;
+       node->fn_fid = *fid;
+       INIT_HLIST_NODE(&node->fn_node);
+       LIPE_INIT_LIST_HEAD(&node->fn_lru_link);
+
+       if (path != NULL)
+               memcpy(node->fn_path, path, path_len + 1);
+
+       return node;
+}
+
+/* Release @fc, its bucket array and every node it holds. A NULL @fc
+ * is accepted and ignored. */
+static void f2p_cache_free(struct f2p_cache *fc)
+{
+       if (fc == NULL)
+               return;
+
+       /* Every cached node sits on the LRU list, so draining that
+        * list frees them all. The hash chains are not unlinked since
+        * the bucket array is released right below. */
+       while (!lipe_list_empty(&fc->fc_lru_list)) {
+               struct f2p_node *victim;
+
+               victim = lipe_list_entry(fc->fc_lru_list.next, struct f2p_node, fn_lru_link);
+               lipe_list_del(&victim->fn_lru_link);
+               free(victim);
+       }
+
+       free(fc->fc_heads);
+       free(fc);
+}
+
+/* Allocate an empty cache with 2^@hash_shift hash buckets that holds
+ * at most @count_max nodes, and store it in *@pfc. Both arguments
+ * must be greater than 1. Always returns 0: there is no failure path
+ * in this function itself. */
+static int f2p_cache_create(struct f2p_cache **pfc, size_t hash_shift, size_t count_max)
+{
+       size_t nr_heads;
+       struct f2p_cache *cache;
+
+       assert(hash_shift > 1);
+       assert(count_max > 1);
+
+       nr_heads = 1UL << hash_shift;
+
+       cache = xcalloc(1, sizeof(*cache));
+       cache->fc_heads = xcalloc(nr_heads, sizeof(cache->fc_heads[0]));
+       cache->fc_hash_mask = nr_heads - 1;
+       cache->fc_node_count_max = count_max;
+       LIPE_INIT_LIST_HEAD(&cache->fc_lru_list);
+
+       *pfc = cache;
+
+       return 0;
+}
+
+/* Look up the path of @fid, consulting @fc first and falling back to
+ * OBD_IOC_FID2PATH on @client_mount_fd on a miss. The outcome of the
+ * ioctl, success or failure, is inserted into the cache, and least
+ * recently used nodes are evicted once fc_node_count_max is exceeded.
+ *
+ * Returns 0 and stores an xstrdup()ed copy of the path in *@ppath
+ * (owned by the caller), or returns the cached negative rc and leaves
+ * *@ppath NULL. */
+static int f2p_cache_find(char **ppath, struct f2p_cache *fc, int client_mount_fd, const struct lu_fid *fid)
+{
+       size_t hash;
+       struct hlist_head *head;
+       struct getinfo_fid2path *gf = NULL;
+       unsigned int pathlen = PATH_MAX;
+       struct f2p_node *fn;
+       struct hlist_node *pos;
+       int rc;
+
+       *ppath = NULL;
+
+       hash = lustre_hash_fnv_1a_64(fid, sizeof(*fid));
+       head = &fc->fc_heads[hash & fc->fc_hash_mask];
+
+       /* This does not handle paths longer than PATH_MAX (4096). It
+        * wouldn't be easy to do so since ll_fid2path() will fail
+        * with -EINVAL if we use a buffer size (pathlen/gf_pathlen)
+        * greater than PATH_MAX. We could try working up the
+        * directory hierarchy. */
+
+       hlist_for_each_entry(fn, pos, head, fn_node)
+               if (lu_fid_eq(fid, &fn->fn_fid))
+                       goto out_found;
+
+       /* Cache miss: ask the client mount. */
+       fn = NULL;
+
+       gf = xcalloc(1, sizeof(*gf) + pathlen);
+       gf->gf_fid = *fid;
+       gf->gf_pathlen = pathlen;
+
+       rc = ioctl(client_mount_fd, OBD_IOC_FID2PATH, gf);
+       if (rc < 0) {
+               /* Failures are cached too, as a node with a negative rc. */
+               fn = f2p_node_create(fid, -errno, NULL);
+       } else {
+               fn = f2p_node_create(fid, 0, gf->gf_u.gf_path);
+       }
+
+       /* XXX The path returned by OBD_IOC_FID2PATH may contain
+        * double slashes due to some quirks in fid2path for striped
+        * directories. (It's still correct.)
+        *
+        * We strip extra slashes in lipe_object_attrs_add_path(). */
+
+       hlist_add_head(&fn->fn_node, head);
+       fc->fc_node_count++;
+
+       assert(fc->fc_node_count_max > 1);
+
+       /* The new node is hashed but not yet on the LRU list, so it
+        * cannot be chosen as a victim here. */
+       while (fc->fc_node_count > fc->fc_node_count_max) {
+               struct f2p_node *fn2;
+
+               assert(!lipe_list_empty(&fc->fc_lru_list));
+               fn2 = lipe_list_entry(fc->fc_lru_list.next, struct f2p_node, fn_lru_link);
+               lipe_list_del(&fn2->fn_lru_link);
+               hlist_del(&fn2->fn_node);
+               free(fn2);
+               fc->fc_node_count--;
+       }
+
+out_found:
+       /* Mark fn as most recently used. For a freshly created node
+        * this is also what links it into the LRU list. */
+       lipe_list_move_tail(&fn->fn_lru_link, &fc->fc_lru_list);
+       rc = fn->fn_rc;
+       if (rc < 0)
+               goto out;
+
+       *ppath = xstrdup(fn->fn_path);
+out:
+       /* gf is still NULL on a cache hit; free(NULL) is a no-op. */
+       free(gf);
+
+       return rc;
+}
+
+/* Resolve @fid to a path through the calling thread's fid2path
+ * cache, creating that cache on first use. Returns whatever
+ * f2p_cache_find() returns: 0 with *@path set to a string the caller
+ * must free, or a negative rc with *@path set to NULL. */
+int ls3_fid2path(char **path, int client_mount_fd, const struct lu_fid *fid)
+{
+       struct f2p_cache *cache = ls3_fid2path_cache;
+
+       if (cache == NULL) {
+               int rc;
+
+               rc = f2p_cache_create(&cache,
+                                     LS3_FID2PATH_CACHE_HASH_SHIFT,
+                                     LS3_FID2PATH_CACHE_COUNT_MAX);
+
+               /* f2p_cache_create() cannot fail as written. Should
+                * that change (tuning, optional disabling, ...) then
+                * this needs to be reworked so that the caller can
+                * fall back to the usual fid2path. */
+               assert(rc == 0);
+
+               ls3_fid2path_cache = cache;
+       }
+
+       assert(cache != NULL);
+
+       return f2p_cache_find(path, cache, client_mount_fd, fid);
+}
+
+/* Free the calling thread's fid2path cache, if any, and reset the
+ * thread local pointer so that a later ls3_fid2path() creates a new
+ * one. Safe to call when no cache was ever created. */
+void ls3_fid2path_cache_fini(void)
+{
+       f2p_cache_free(ls3_fid2path_cache);
+       ls3_fid2path_cache = NULL;
+}
diff --git a/lipe/src/lipe_scan3/ls3_fid2path.h b/lipe/src/lipe_scan3/ls3_fid2path.h
new file mode 100644 (file)
index 0000000..a1a1eb8
--- /dev/null
@@ -0,0 +1,11 @@
+#ifndef _LS3_FID2PATH_H_
+#define _LS3_FID2PATH_H_
+
+#include <stddef.h>
+
+struct lu_fid;
+
+/* Cached fid2path for directory FIDs. Looks @fid up in a per-thread
+ * cache and falls back to OBD_IOC_FID2PATH on @client_mount_fd.
+ * Returns 0 and stores a newly allocated path in *@path (the caller
+ * frees it), or a negative rc with *@path set to NULL. */
+int ls3_fid2path(char **path, int client_mount_fd, const struct lu_fid *fid);
+
+/* Destroy the calling thread's cache. Each thread that called
+ * ls3_fid2path() must call this before it exits. */
+void ls3_fid2path_cache_fini(void);
+
+#endif /* _LS3_FID2PATH_H_ */
index 0d0b005..29b3edb 100644 (file)
@@ -1115,6 +1115,9 @@ static void ls3_main_scm(void *data, int argc, char *argv[])
 
        while ((c = getopt_long(argc, argv, "hil:s:v", options, NULL)) != EOF) {
                switch (c) {
+               case LS3_OPT_ALL_PATHS:
+                       print_all_paths = true;
+                       break;
                case LS3_OPT_CLIENT_MOUNT:
                        ls3_client_mount_path = optarg;
                        break;
index c30f11c..8c84dc8 100644 (file)
@@ -514,26 +514,16 @@ int lipe_object_attrs_set_links(struct ls3_object_attrs *attrs,
        return 0;
 }
 
-int lipe_object_attrs_add_path(struct ls3_object_attrs *attrs,
-                              const char *path)
-{
-       struct lipe_path_entry *lpe = NULL;
-
-       lpe = xcalloc(1, sizeof(*lpe));
-       lpe->lpe_path = xstrdup(path);
-
-       lipe_list_add_tail(&lpe->lpe_linkage, &attrs->loa_paths);
-
-       return 0;
-}
-
 /* Fixup DNE striped directory path with '//'. Root => "". Does not
  * return "/" for root. See also copy_strip_dne_path(). */
 static void lipe_fid2path_fixup(char *path)
 {
-       char *d, *s;
+       char *d = path, *s = path;
+
+       while (*s == '/')
+               s++;
 
-       for (d = path, s = path; *s != '\0'; s++) {
+       for (; *s != '\0'; s++) {
                if (*s == '/' && *(s + 1) == '/')
                        continue;
 
@@ -543,6 +533,20 @@ static void lipe_fid2path_fixup(char *path)
        *d = '\0';
 }
 
+/* Append a private copy of @path to @attrs->loa_paths. The copy is
+ * passed through lipe_fid2path_fixup() first, so callers may hand in
+ * raw fid2path output. Always returns 0. */
+int lipe_object_attrs_add_path(struct ls3_object_attrs *attrs,
+                              const char *path)
+{
+       struct lipe_path_entry *lpe = NULL;
+
+       lpe = xcalloc(1, sizeof(*lpe));
+       lpe->lpe_path = xstrdup(path);
+       /* Fix up the copy in place; @path itself is left untouched. */
+       lipe_fid2path_fixup(lpe->lpe_path);
+
+       lipe_list_add_tail(&lpe->lpe_linkage, &attrs->loa_paths);
+
+       return 0;
+}
+
 int lipe_object_attrs_set_paths(struct ls3_object_attrs *loa,
                               int client_mount_fd)
 {
@@ -583,8 +587,6 @@ int lipe_object_attrs_set_paths(struct ls3_object_attrs *loa,
                        goto out;
                }
 
-               lipe_fid2path_fixup(gf->gf_u.gf_path);
-
                rc = lipe_object_attrs_add_path(loa, gf->gf_u.gf_path);
                if (rc < 0)
                        goto out;
index 5d6a920..c86147c 100644 (file)
@@ -161,7 +161,8 @@ struct ls3_object_attrs {
        int64_t                  loa_projid;
        struct lu_fid            loa_file_fid;
        struct lu_fid            loa_self_fid;
-
+       uint32_t                 loa_lma_compat;
+       uint32_t                 loa_lma_incompat;
        char                     loa_leh_buf[XATTR_SIZE_MAX];
        char                     loa_lmv_buf[XATTR_SIZE_MAX];
        struct lov_user_md      *loa_lum;
index b5a3a38..4794fc8 100644 (file)
@@ -9,6 +9,7 @@
  */
 #include "ls3_scan.h"
 #include <stdbool.h>
+#include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <assert.h>
@@ -26,6 +27,7 @@
 #include <com_err.h>
 #include <ext2fs/ext2fs.h>
 #include "ls3_debug.h"
+#include "ls3_fid2path.h"
 #include "ls3_object_attrs.h"
 
 /* XXX We are mixing libext2fs errcode_t (long), pthread positive rcs,
@@ -117,6 +119,8 @@ ldiskfs_read_attr_self_fid(struct ls3_instance *li,
 
        lma = (struct lustre_mdt_attrs *)buf;
        fid_le_to_cpu(&loa->loa_self_fid, &lma->lma_self_fid);
+       loa->loa_lma_compat = ext2fs_le32_to_cpu(lma->lma_compat);
+       loa->loa_lma_incompat = ext2fs_le32_to_cpu(lma->lma_incompat);
        loa->loa_attr_bits |= LS3_OBJECT_ATTR_SELF_FID;
 
        return 0;
@@ -202,6 +206,20 @@ ldiskfs_read_attr_links(struct ls3_instance *li,
        if (loa->loa_attr_bits & LS3_OBJECT_ATTR_LINKS)
                return 0;
 
+       assert(lipe_list_empty(&loa->loa_links));
+
+       /* We use the link xattr plus caching to speed up paths but
+        * this is tricky because the link xattr is not always updated
+        * properly after unlink. This is why we test
+        * LMAI_ORPHAN. */
+
+       rc = ldiskfs_read_attr_self_fid(li, lo, loa);
+       if (rc < 0)
+               return rc;
+
+       if (loa->loa_lma_incompat & LMAI_ORPHAN)
+               goto out_ok;
+
        rc = ext2fs_attr_get(fs, (struct ext2_inode *)inode,
                             EXT2_ATTR_INDEX_TRUSTED,
                             XATTR_NAME_LINK + strlen("trusted."),
@@ -218,38 +236,88 @@ ldiskfs_read_attr_links(struct ls3_instance *li,
                LS3_ERROR_OBJ(lo, "cannot decode link xattr: rc = %ld\n", rc);
                return rc;
        }
-
+out_ok:
        loa->loa_attr_bits |= LS3_OBJECT_ATTR_LINKS;
 
        return 0;
 }
 
+/* printf() into a newly allocated string which the caller must
+ * free(). Never returns NULL: if vasprintf() fails the process is
+ * aborted. The check is explicit rather than an assert() so that it
+ * is not compiled out by NDEBUG, and it tests rc first because the
+ * value of str is not reliable after a failed vasprintf(). */
+static char *xsprintf(const char *fmt, ...)
+{
+       char *str = NULL;
+       va_list ap;
+       int rc;
+
+       va_start(ap, fmt);
+       rc = vasprintf(&str, fmt, ap);
+       va_end(ap);
+
+       if (rc < 0 || str == NULL) {
+               fprintf(stderr, "xsprintf: cannot format string\n");
+               abort();
+       }
+
+       return str;
+}
+
 static int
 ldiskfs_read_attr_paths(struct ls3_instance *li,
                        struct lipe_object *lo,
                        struct ls3_object_attrs *loa)
 {
+       struct lipe_link_entry *lle;
        int rc;
 
        if (loa->loa_attr_bits & LS3_OBJECT_ATTR_PATHS)
                return 0;
 
-       rc = ldiskfs_read_attr_file_fid(li, lo, loa);
-       if (rc < 0)
-               return rc;
+       assert(lipe_list_empty(&loa->loa_paths));
 
-       if (li->li_device_is_mdt) {
-               /* We cannot use links to make paths faster because
-                * the linkea is not updated properly after
-                * unlink. But requiring a link xattr before fid2path
-                * prevents MDT crashes when we pass fids of OI_scrub
-                * or other internal files. */
-               rc = ldiskfs_read_attr_links(li, lo, loa);
+       if (li->li_device_is_ost) {
+               rc = ldiskfs_read_attr_file_fid(li, lo, loa);
                if (rc < 0)
                        return rc;
+
+               return lipe_object_attrs_set_paths(loa, li->li_client_mount_fd);
+       }
+
+       /* Requiring a link xattr before fid2path prevents MDT crashes
+        * when we pass fids of OI_scrub or other internal files. */
+
+       rc = ldiskfs_read_attr_links(li, lo, loa);
+       if (rc < 0)
+               return rc;
+
+       if (lipe_list_empty(&loa->loa_links)) {
+               /* This must be a pending/orphan file. */
+               goto out_ok;
        }
 
-       return lipe_object_attrs_set_paths(loa, li->li_client_mount_fd);
+       /* TODO Add a way to tune or disable fid2path caching. */
+
+       rc = 0;
+       lipe_list_for_each_entry(lle, &loa->loa_links, lle_linkage) {
+               char *parent_path = NULL;
+               char *path = NULL;
+               int rc2;
+
+               rc2 = ls3_fid2path(&parent_path, li->li_client_mount_fd, &lle->lle_parent_fid);
+               if (rc2 < 0) {
+                       assert(parent_path == NULL);
+                       rc = rc2;
+                       continue;
+               }
+
+               path = xsprintf("%s/%s", parent_path, lle->lle_name);
+               lipe_object_attrs_add_path(loa, path);
+               free(parent_path);
+               free(path);
+       }
+
+       if (lipe_list_empty(&loa->loa_paths)) {
+               assert(rc < 0);
+               return rc;
+       }
+out_ok:
+       loa->loa_attr_bits |= LS3_OBJECT_ATTR_PATHS;
+
+       return 0;
 }
 
 static int ldiskfs_copy_xattr(char *name, char *value, size_t value_len,
@@ -833,6 +901,8 @@ out_close:
 out_free_attrs:
        lipe_object_attrs_fini(&loa);
 out:
+       ls3_fid2path_cache_fini();
+
        LS3_DEBUG_D(rc);
 
        return (void *)rc;
index c050b80..61a7248 100644 (file)
@@ -1,3 +1,4 @@
+
 #!/bin/bash
 #
 # Tests for lipe_find and lipe_scan.
@@ -704,7 +705,7 @@ test_300() {
        init_lipe_scan3_env_file "$file"
        fid=$($LFS path2fid "$file")
 
-lipe_scan3 "$device" --print-file-fid
+       lipe_scan3 "$device" --print-file-fid
 
        out=$(lipe_scan3 "$device" --print-file-fid)
        [[ "$out" == "$fid" ]] || error "--print-file-fid should print '$fid'"
@@ -755,12 +756,104 @@ test_302() {
        out=$(lipe_scan3 "$device" --print-relative-path)
        [[ "$out" == "$tfile" ]] || error "--print-relative-path should print relative path"
 
-       # TODO --all-paths
        # TODO --null
        # TODO --delim
 }
 run_test 302 "--print-*-path options work"
 
+# Compare the absolute paths printed by lipe_scan3 for MDT0 against
+# what 'lfs find --mdt-index 0' reports for a small tree of striped
+# and plain directories.
+test_303() {
+       local facet=mds1
+       local device="$(facet_device $facet)"
+       local tmp1=$(mktemp)
+       local tmp2=$(mktemp)
+       local rc=0
+
+       init_lipe_scan3_env
+
+       $LFS mkdir -c $MDSCOUNT $MOUNT/$tdir
+       $LFS mkdir -c $MDSCOUNT $MOUNT/$tdir/d{0..3}
+       $LFS mkdir -c 1 -i 0 $MOUNT/$tdir/d{0..3}/d{0..3}
+       touch $MOUNT/$tdir/d{0..3}/d{0..3}/f{0..3}
+       sync
+
+       # XXX lipe_scan3 does not return the ROOT
+       (echo "$MOUNT"; lipe_scan3 "${device}" --print-absolute-path) | sort > "$tmp1"
+       $LFS find "$MOUNT" --mdt-index 0 | sort > "$tmp2"
+
+       # Remove the temporary files before reporting so that a
+       # failure does not leave them behind.
+       diff "$tmp1" "$tmp2" || rc=$?
+       rm -f "$tmp1" "$tmp2"
+       ((rc == 0)) || error "--print-absolute-path should print the right paths"
+}
+run_test 303 "--print-absolute-path prints the right paths"
+
+# A file with three hard links: without --all-paths only the first
+# path is printed, with --all-paths every link is printed.
+test_304() {
+       local facet=mds1
+       local device="$(facet_device $facet)"
+       local file=$MOUNT/$tfile
+       local out
+       local expect
+
+       init_lipe_scan3_env_file "$file"
+
+       ln "$file" "$file-1"
+       ln "$file" "$file-2"
+       sync
+
+       out=$(lipe_scan3 "${device}" --print-absolute-path)
+       [[ "$out" == "$file" ]] || error "print-absolute-path got '$out', expected '$file'"
+
+       out=$(lipe_scan3 "${device}" --print-absolute-path --all-paths | sort)
+       expect=$(ls "$file"*)
+
+       [[ "$out" == "$expect" ]] || error "print-absolute-path got '$out', expected '$expect'"
+
+       out=$(lipe_scan3 "${device}" --print-relative-path)
+       [[ "$out" == "$tfile" ]] || error "print-relative-path got '$out', expected '$tfile'"
+
+       # The three links are expected to be the only entries under
+       # $MOUNT at this point.
+       out=$(lipe_scan3 "${device}" --print-relative-path --all-paths | sort)
+       expect=$(ls "$MOUNT")
+       [[ "$out" == "$expect" ]] || error "print-relative-path got '$out', expected '$expect'"
+}
+run_test 304 "print-*-paths with multiple links"
+
+# Check the "paths" array of --print-json for a file on MDT0 below
+# striped directories, first with a single link and then with three.
+test_305() {
+       local facet=mds1
+       local device="$(facet_device $facet)"
+       local file
+       local fid
+       local mdt_index
+       local out
+
+       init_lipe_scan3_env
+
+       $LFS mkdir -c $MDSCOUNT $MOUNT/$tdir
+       $LFS mkdir -c $MDSCOUNT $MOUNT/$tdir/d0
+       $LFS mkdir -c 1 -i 0 $MOUNT/$tdir/d0/d0
+       file=$MOUNT/$tdir/d0/d0/f0
+       touch $file
+       fid=$($LFS path2fid $file)
+       # Only MDT0 is scanned, so the file has to live there.
+       mdt_index=$($LFS getstripe --mdt-index $file)
+       ((mdt_index == 0)) || error "expected MDT index 0, got '$mdt_index'"
+       sync
+
+       # Single link: paths[] has exactly one, mount-relative, entry.
+       out=$(lipe_scan3 "${device}" --print-json=file_fid,paths |
+       jq --raw-output --arg FID "$fid" 'select(.file_fid == $FID) | .paths[]')
+       [[ "$out" == "$tdir/d0/d0/f0" ]] || error "JSON got '$out', expected '$tdir/d0/d0/f0'"
+
+       ln "$file" "$file"-1
+       ln "$file" "$file"-2
+       sync
+
+       # Three links: the checks below expect paths[] to list them
+       # in the order in which they were created.
+       out=$(lipe_scan3 "${device}" --print-json=file_fid,paths |
+       jq --raw-output --arg FID "$fid" 'select(.file_fid == $FID) | .paths[0]')
+       [[ "$out" == "$tdir/d0/d0/f0" ]] || error "JSON got '$out', expected '$tdir/d0/d0/f0'"
+
+       out=$(lipe_scan3 "${device}" --print-json=file_fid,paths |
+       jq --raw-output --arg FID "$fid" 'select(.file_fid == $FID) | .paths[1]')
+       [[ "$out" == "$tdir/d0/d0/f0-1" ]] || error "JSON got '$out', expected '$tdir/d0/d0/f0-1'"
+
+       out=$(lipe_scan3 "${device}" --print-json=file_fid,paths |
+       jq --raw-output --arg FID "$fid" 'select(.file_fid == $FID) | .paths[2]')
+       [[ "$out" == "$tdir/d0/d0/f0-2" ]] || error "JSON got '$out', expected '$tdir/d0/d0/f0-2'"
+}
+run_test 305 "print-json prints the right paths"
+
 # loading and scripts
 
 test_400() {