Whamcloud - gitweb
LU-14361 statahead: regularized fname statahead pattern 08/41308/40
authorQian Yingjin <qian@ddn.com>
Mon, 10 Oct 2022 08:17:31 +0000 (04:17 -0400)
committerOleg Drokin <green@whamcloud.com>
Wed, 3 Jan 2024 03:02:07 +0000 (03:02 +0000)
Some applications issue stat() calls in a directory in which
all child files have regularized file names:
- mdtest benchmark tool: mdtest.$rank.$i
- ML/AI workloads whose ingested data typically follows a naming
  rule for the files in the directory.

The most common format for a regularized file name is one whose
suffix is a numeric index.
However, in the current statahead mechanism, statahead entries are
populated in the order of the file name hash as returned by
readdir() calls, not in any sorted order.

In this patch, we improve statahead to prefetch attributes for
files with regularized, numerically indexed file names via
asynchronous batched RPCs.

This patch adds statahead support for these kinds of
applications, which can be optimized even though they do not call
opendir()/close() to start/stop the statahead thread explicitly.

Instead, the statahead thread will stop and quit when it finds
that there has been no activity for more than a certain time
period (30 seconds by default).

Test-Parameters: mdtcount=4 mdscount=2 testlist=sanity env=ONLY=27p,ONLY_REPEAT=5
Test-Parameters: mdtcount=4 mdscount=2 testlist=sanity env=ONLY=27p,ONLY_REPEAT=5
Test-Parameters: mdtcount=4 mdscount=2 testlist=sanity env=ONLY=123f,ONLY_REPEAT=10
Test-Parameters: mdtcount=4 mdscount=2 testlist=sanity env=ONLY=123f,ONLY_REPEAT=10
Signed-off-by: Qian Yingjin <qian@ddn.com>
Change-Id: Ide11ec5a651ae74884ddbe1cecede4f5c961e38d
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/41308
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Lai Siyao <lai.siyao@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/llite/file.c
lustre/llite/llite_internal.h
lustre/llite/llite_lib.c
lustre/llite/lproc_llite.c
lustre/llite/statahead.c
lustre/ptlrpc/layout.c
lustre/tests/sanity.sh

index 83a9d07..dcc8a93 100644 (file)
@@ -5509,6 +5509,7 @@ int ll_getattr_dentry(struct dentry *de, struct kstat *stat, u32 request_mask,
              request_mask & STATX_MTIME))
                need_glimpse = false;
 
+       ll_statahead_enter(dir, de);
        if (dentry_may_statahead(dir, de))
                ll_start_statahead(dir, de, need_glimpse &&
                                   !(flags & AT_STATX_DONT_SYNC));
index e52d49f..399e69a 100644 (file)
@@ -119,9 +119,28 @@ struct ll_trunc_sem {
 };
 
 enum ll_sa_pattern {
-       LSA_PATTERN_NONE        = 0x0000,
-       LSA_PATTERN_LIST        = 0x0001,
-       LSA_PATTERN_FNAME       = 0X0002,
+       LSA_PATTERN_NONE                = 0x0000,
+       /* once detected, and found no statahead pattern matched */
+       LSA_PATTERN_INVALID             = 0x0001,
+       /* do the directory listing. i.e ls $dir */
+       LSA_PATTERN_LIST                = 0x0002,
+       /* regularized file name scanning, i.e. mdtest.$i */
+       LSA_PATTERN_FNAME               = 0x0004,
+       /* statahead advise via statahead hint from users */
+       LSA_PATTERN_ADVISE              = 0x0008,
+       /* not first dirent, or is "." for listing */
+       LSA_PATTERN_LS_NOT_FIRST_DE     = 0x0100,
+       /* the file names of stat() calls has regularized predictable format */
+       LSA_PATTERN_FN_PREDICT          = 0x1000,
+       /* fname statahead workload similar to mdtest shared dir stat() */
+       LSA_PATTERN_FN_SHARED           = 0x2000,
+       /* fname statahead workload similar to mdtest unique dir stat() */
+       LSA_PATTERN_FN_UNIQUE           = 0x4000,
+       /* fname statahead workload with stride regularized naming format */
+       LSA_PATTERN_FN_STRIDE           = 0x8000,
+       LSA_PATTERN_MASK                = (LSA_PATTERN_LIST |
+                                          LSA_PATTERN_FNAME |
+                                          LSA_PATTERN_ADVISE),
        LSA_PATTERN_MAX,
 };
 
@@ -205,6 +224,18 @@ struct ll_inode_info {
                        unsigned int                    lli_sa_generation;
                        /* access pattern for statahead */
                        enum ll_sa_pattern              lli_sa_pattern;
+                       /*
+                        * suffix index number of the latest stat dentry. It
+                        * is used for the detection of the file name statahead
+                        * pattern.
+                        */
+                       __u32                           lli_sa_fname_index;
+                       /*
+                        * indicate the count that the suffix index number
+                        * matched continuously. This field is using for the
+                        * detection of the file name statahead pattern.
+                        */
+                       unsigned int                    lli_sa_match_count;
                        /* rw lock protects lli_lsm_md */
                        struct rw_semaphore             lli_lsm_sem;
                        /* directory stripe information */
@@ -872,6 +903,7 @@ struct ll_sb_info {
        unsigned int              ll_sa_batch_max;/* max SUB request count in
                                                   * a batch PTLRPC request */
        unsigned int              ll_sa_max;     /* max statahead RPCs */
+       unsigned int              ll_sa_min;     /* min statahead req count */
        atomic_t                  ll_sa_total;   /* statahead thread started
                                                  * count */
        atomic_t                  ll_sa_wrong;   /* statahead thread stopped for
@@ -881,6 +913,16 @@ struct ll_sb_info {
        atomic_t                  ll_agl_total;  /* AGL thread started count */
        atomic_t                  ll_sa_hit_total;  /* total hit count */
        atomic_t                  ll_sa_miss_total; /* total miss count */
+       /* statahead thread count started for directory traversing pattern. */
+       atomic_t                  ll_sa_list_total;
+       /* statahead thread count started for regularized file name pattern. */
+       atomic_t                  ll_sa_fname_total;
+       /*
+        * stop the statahead thread if it is not doing a stat() in such time
+        * period as it probably does not care too much about performance or
+        * the user is no longer using this directory.
+        */
+       unsigned long             ll_sa_timeout;
 
        dev_t                     ll_sdev_orig; /* save s_dev before assign for
                                                 * clustred nfs */
@@ -1653,9 +1695,12 @@ void ll_ra_stats_inc(struct inode *inode, enum ra_stat which);
 
 /* statahead.c */
 
-#define LL_SA_RPC_MIN           8
-#define LL_SA_RPC_DEF           128
-#define LL_SA_RPC_MAX           2048
+#define LL_SA_REQ_MIN           2
+#define LL_SA_REQ_MIN_DEF      8
+#define LL_SA_REQ_MAX          2048
+#define LL_SA_REQ_MAX_DEF      128
+
+#define LL_SA_TIMEOUT_DEF      30
 
 /* XXX: If want to support more concurrent statahead instances,
  *     please consider to decentralize the RPC lists attached
@@ -1671,6 +1716,9 @@ void ll_ra_stats_inc(struct inode *inode, enum ra_stat which);
 #define LL_SA_CACHE_SIZE        (1 << LL_SA_CACHE_BIT)
 #define LL_SA_CACHE_MASK        (LL_SA_CACHE_SIZE - 1)
 
+#define LSA_FN_PREDICT_HIT     2
+#define LSA_FN_MATCH_HIT       4
+
 /* statahead controller, per process struct, for dir only */
 struct ll_statahead_info {
        struct dentry          *sai_dentry;
@@ -1709,8 +1757,23 @@ struct ll_statahead_info {
        __u32                   sai_max_batch_count;
        __u64                   sai_index_end;
 
-       __u64                   sai_fstart;
-       __u64                   sai_fend;
+       union {
+               /* for ADVISE statahead pattern */
+               struct {
+                       __u64   sai_fstart;
+                       __u64   sai_fend;
+               };
+
+               /* for FNAME statahead pattern */
+               struct {
+                       __u64   sai_fname_index;
+                       /*
+                        * The length of file name statahead pattern where the
+                        * front part is padding with 0.
+                        */
+                       __u8    sai_fname_zeroed_len;
+               };
+       };
        char                    sai_fname[NAME_MAX];
 };
 
@@ -1728,6 +1791,7 @@ int ll_revalidate_statahead(struct inode *dir, struct dentry **dentry,
 int ll_start_statahead(struct inode *dir, struct dentry *dentry, bool agl);
 void ll_authorize_statahead(struct inode *dir, void *key);
 void ll_deauthorize_statahead(struct inode *dir, void *key);
+void ll_statahead_enter(struct inode *dir, struct dentry *dentry);
 
 /* glimpse.c */
 blkcnt_t dirty_cnt(struct inode *inode);
@@ -1804,7 +1868,10 @@ dentry_may_statahead(struct inode *dir, struct dentry *dentry)
            ldd->lld_sa_generation == lli->lli_sa_generation)
                return false;
 
-       if (lli->lli_sa_pattern == LSA_PATTERN_FNAME)
+       if (lli->lli_sa_pattern & LSA_PATTERN_ADVISE)
+               return true;
+
+       if (lli->lli_sa_pattern & (LSA_PATTERN_FNAME | LSA_PATTERN_FN_PREDICT))
                return true;
 
        /* not the same process, don't statahead */
index 199b5b0..6ddd3b9 100644 (file)
@@ -183,13 +183,17 @@ static struct ll_sb_info *ll_init_sbi(struct lustre_sb_info *lsi)
        /* metadata statahead is enabled by default */
        sbi->ll_sa_running_max = LL_SA_RUNNING_DEF;
        sbi->ll_sa_batch_max = LL_SA_BATCH_DEF;
-       sbi->ll_sa_max = LL_SA_RPC_DEF;
+       sbi->ll_sa_max = LL_SA_REQ_MAX_DEF;
+       sbi->ll_sa_min = LL_SA_REQ_MIN_DEF;
+       sbi->ll_sa_timeout = LL_SA_TIMEOUT_DEF;
        atomic_set(&sbi->ll_sa_total, 0);
        atomic_set(&sbi->ll_sa_wrong, 0);
        atomic_set(&sbi->ll_sa_running, 0);
        atomic_set(&sbi->ll_agl_total, 0);
        atomic_set(&sbi->ll_sa_hit_total, 0);
        atomic_set(&sbi->ll_sa_miss_total, 0);
+       atomic_set(&sbi->ll_sa_list_total, 0);
+       atomic_set(&sbi->ll_sa_fname_total, 0);
        set_bit(LL_SBI_AGL_ENABLED, sbi->ll_flags);
        set_bit(LL_SBI_FAST_READ, sbi->ll_flags);
        set_bit(LL_SBI_TINY_WRITE, sbi->ll_flags);
index ef8fabf..060e8fb 100644 (file)
@@ -864,10 +864,10 @@ static ssize_t statahead_max_store(struct kobject *kobj,
        if (rc)
                return rc;
 
-       if (val > LL_SA_RPC_MAX) {
+       if (val > LL_SA_REQ_MAX) {
                CWARN("%s: statahead_max value %lu limited to maximum %d\n",
-                     sbi->ll_fsname, val, LL_SA_RPC_MAX);
-               val = LL_SA_RPC_MAX;
+                     sbi->ll_fsname, val, LL_SA_REQ_MAX);
+               val = LL_SA_REQ_MAX;
        }
 
        sbi->ll_sa_max = val;
@@ -875,6 +875,72 @@ static ssize_t statahead_max_store(struct kobject *kobj,
 }
 LUSTRE_RW_ATTR(statahead_max);
 
+static ssize_t statahead_min_show(struct kobject *kobj,
+                                 struct attribute *attr,
+                                 char *buf)
+{
+       struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+                                             ll_kset.kobj);
+
+       return scnprintf(buf, PAGE_SIZE, "%u\n", sbi->ll_sa_min);
+}
+
+static ssize_t statahead_min_store(struct kobject *kobj,
+                                  struct attribute *attr,
+                                  const char *buffer,
+                                  size_t count)
+{
+       struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+                                             ll_kset.kobj);
+       unsigned long val;
+       int rc;
+
+       rc = kstrtoul(buffer, 0, &val);
+       if (rc)
+               return rc;
+
+       if (val < LL_SA_REQ_MIN)
+               CERROR("%s: bad statahead_min %lu < min %u\n",
+                      sbi->ll_fsname, val, LL_SA_REQ_MIN);
+       else if (val > sbi->ll_sa_max)
+               CERROR("%s: bad statahead_min %lu > max %u\n",
+                      sbi->ll_fsname, val, sbi->ll_sa_max);
+       else
+               sbi->ll_sa_min = val;
+
+       return count;
+}
+LUSTRE_RW_ATTR(statahead_min);
+
+static ssize_t statahead_timeout_show(struct kobject *kobj,
+                                     struct attribute *attr,
+                                     char *buf)
+{
+       struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+                                             ll_kset.kobj);
+
+       return scnprintf(buf, PAGE_SIZE, "%lu\n", sbi->ll_sa_timeout);
+}
+
+static ssize_t statahead_timeout_store(struct kobject *kobj,
+                                      struct attribute *attr,
+                                      const char *buffer,
+                                      size_t count)
+{
+       struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+                                             ll_kset.kobj);
+       unsigned long val;
+       int rc;
+
+       rc = kstrtoul(buffer, 0, &val);
+       if (rc)
+               return rc;
+
+       sbi->ll_sa_timeout = val;
+       return count;
+}
+LUSTRE_RW_ATTR(statahead_timeout);
+
 static ssize_t statahead_agl_show(struct kobject *kobj,
                                  struct attribute *attr,
                                  char *buf)
@@ -917,11 +983,15 @@ static int ll_statahead_stats_seq_show(struct seq_file *m, void *v)
        seq_printf(m, "statahead total: %u\n"
                      "statahead wrong: %u\n"
                      "agl total: %u\n"
+                     "list_total: %u\n"
+                     "fname_total: %u\n"
                      "hit_total: %u\n"
                      "miss_total: %u\n",
                   atomic_read(&sbi->ll_sa_total),
                   atomic_read(&sbi->ll_sa_wrong),
                   atomic_read(&sbi->ll_agl_total),
+                  atomic_read(&sbi->ll_sa_list_total),
+                  atomic_read(&sbi->ll_sa_fname_total),
                   atomic_read(&sbi->ll_sa_hit_total),
                   atomic_read(&sbi->ll_sa_miss_total));
        return 0;
@@ -938,6 +1008,8 @@ static ssize_t ll_statahead_stats_seq_write(struct file *file,
        atomic_set(&sbi->ll_sa_total, 0);
        atomic_set(&sbi->ll_sa_wrong, 0);
        atomic_set(&sbi->ll_agl_total, 0);
+       atomic_set(&sbi->ll_sa_list_total, 0);
+       atomic_set(&sbi->ll_sa_fname_total, 0);
        atomic_set(&sbi->ll_sa_hit_total, 0);
        atomic_set(&sbi->ll_sa_miss_total, 0);
 
@@ -1950,6 +2022,8 @@ static struct attribute *llite_attrs[] = {
        &lustre_attr_statahead_running_max.attr,
        &lustre_attr_statahead_batch_max.attr,
        &lustre_attr_statahead_max.attr,
+       &lustre_attr_statahead_min.attr,
+       &lustre_attr_statahead_timeout.attr,
        &lustre_attr_statahead_agl.attr,
        &lustre_attr_lazystatfs.attr,
        &lustre_attr_statfs_max_age.attr,
index 29e32e9..c27b315 100644 (file)
@@ -548,7 +548,8 @@ static inline void ll_sax_put(struct inode *dir,
        if (atomic_dec_and_lock(&ctx->sax_refcount, &lli->lli_sa_lock)) {
                lli->lli_sai = NULL;
                lli->lli_sax = NULL;
-               if (lli->lli_sa_pattern == LSA_PATTERN_FNAME) {
+               if (lli->lli_sa_pattern & (LSA_PATTERN_ADVISE |
+                                          LSA_PATTERN_FNAME)) {
                        lli->lli_opendir_key = NULL;
                        lli->lli_opendir_pid = 0;
                        lli->lli_sa_enabled = 0;
@@ -574,7 +575,7 @@ static struct ll_statahead_info *ll_sai_alloc(struct dentry *dentry)
 
        sai->sai_dentry = dget(dentry);
        atomic_set(&sai->sai_refcount, 1);
-       sai->sai_max = LL_SA_RPC_MIN;
+       sai->sai_max = ll_i2sbi(dentry->d_inode)->ll_sa_min;
        sai->sai_index = 1;
        init_waitqueue_head(&sai->sai_waitq);
 
@@ -1313,8 +1314,55 @@ static int ll_statahead_by_list(struct dentry *parent)
        RETURN(rc);
 }
 
-static int ll_statahead_by_fname(struct ll_statahead_info *sai,
-                                struct dentry *parent)
+static void ll_statahead_handle(struct ll_statahead_info *sai,
+                               struct dentry *parent, const char *name,
+                               int len, const struct lu_fid *fid)
+{
+       struct inode *dir = parent->d_inode;
+       struct ll_inode_info *lli = ll_i2info(dir);
+       struct ll_sb_info *sbi = ll_i2sbi(dir);
+       long timeout;
+
+       while (({set_current_state(TASK_IDLE);
+               /* matches smp_store_release() in ll_deauthorize_statahead() */
+                smp_load_acquire(&sai->sai_task); })) {
+               spin_lock(&lli->lli_agl_lock);
+               while (sa_sent_full(sai) && !agl_list_empty(sai)) {
+                       struct ll_inode_info *clli;
+
+                       __set_current_state(TASK_RUNNING);
+                       clli = agl_first_entry(sai);
+                       list_del_init(&clli->lli_agl_list);
+                       spin_unlock(&lli->lli_agl_lock);
+
+                       ll_agl_trigger(&clli->lli_vfs_inode, sai);
+                       cond_resched();
+                       spin_lock(&lli->lli_agl_lock);
+               }
+               spin_unlock(&lli->lli_agl_lock);
+
+               if (!sa_sent_full(sai))
+                       break;
+
+               /*
+                * If the thread is not doing a stat in 30s then it probably
+                * does not care too much about performance, or is no longer
+                * using this directory. Stop the statahead thread in this case.
+                */
+               timeout = schedule_timeout(
+                               cfs_time_seconds(sbi->ll_sa_timeout));
+               if (timeout == 0) {
+                       lli->lli_sa_enabled = 0;
+                       break;
+               }
+       }
+       __set_current_state(TASK_RUNNING);
+
+       sa_statahead(sai, parent, name, len, fid);
+}
+
+static int ll_statahead_by_advise(struct ll_statahead_info *sai,
+                                 struct dentry *parent)
 {
        struct inode *dir = parent->d_inode;
        struct ll_inode_info *lli = ll_i2info(dir);
@@ -1328,7 +1376,7 @@ static int ll_statahead_by_fname(struct ll_statahead_info *sai,
 
        ENTRY;
 
-       CDEBUG(D_READA, "%s: FNAME statahead: parent %pd fname prefix %s\n",
+       CDEBUG(D_READA, "%s: ADVISE statahead: parent %pd fname prefix %s\n",
               sbi->ll_fsname, parent, sai->sai_fname);
 
        OBD_ALLOC(fname, NAME_MAX);
@@ -1341,42 +1389,70 @@ static int ll_statahead_by_fname(struct ll_statahead_info *sai,
        ptr = fname + len;
 
        /* matches smp_store_release() in ll_deauthorize_statahead() */
-       while (smp_load_acquire(&sai->sai_task)) {
+       while (smp_load_acquire(&sai->sai_task) && lli->lli_sa_enabled) {
                size_t numlen;
 
                numlen = snprintf(ptr, max_len, "%llu",
                                  sai->sai_fstart + i);
 
-               while (({set_current_state(TASK_IDLE);
-                        /*
-                         * matches smp_store_release() in
-                         * ll_deauthorize_statahead()
-                         */
-                        smp_load_acquire(&sai->sai_task); })) {
-                       spin_lock(&lli->lli_agl_lock);
-                       while (sa_sent_full(sai) && !agl_list_empty(sai)) {
-                               struct ll_inode_info *clli;
+               ll_statahead_handle(sai, parent, fname, len + numlen, NULL);
+               if (++i >= sai->sai_fend)
+                       break;
+       }
 
-                               __set_current_state(TASK_RUNNING);
-                               clli = agl_first_entry(sai);
-                               list_del_init(&clli->lli_agl_list);
-                               spin_unlock(&lli->lli_agl_lock);
+       OBD_FREE(fname, NAME_MAX);
+       RETURN(rc);
+}
 
-                               ll_agl_trigger(&clli->lli_vfs_inode, sai);
-                               cond_resched();
-                               spin_lock(&lli->lli_agl_lock);
-                       }
-                       spin_unlock(&lli->lli_agl_lock);
+static int ll_statahead_by_fname(struct ll_statahead_info *sai,
+                                struct dentry *parent)
+{
+       struct inode *dir = parent->d_inode;
+       struct ll_inode_info *lli = ll_i2info(dir);
+       struct ll_sb_info *sbi = ll_i2sbi(dir);
+       size_t max_len;
+       size_t len;
+       char *fname;
+       char *ptr;
+       int rc = 0;
 
-                       if (!sa_sent_full(sai))
-                               break;
-                       schedule();
-               }
-               __set_current_state(TASK_RUNNING);
+       ENTRY;
 
-               sa_statahead(sai, parent, fname, len + numlen, NULL);
-               if (++i >= sai->sai_fend)
+       CDEBUG(D_READA, "%s: FNAME statahead: parent %pd fname prefix %s\n",
+              sbi->ll_fsname, parent, sai->sai_fname);
+
+       OBD_ALLOC(fname, NAME_MAX);
+       if (fname == NULL)
+               RETURN(-ENOMEM);
+
+       len = strlen(sai->sai_fname);
+       memcpy(fname, sai->sai_fname, len);
+       max_len = sizeof(sai->sai_fname) - len;
+       ptr = fname + len;
+
+       /* matches smp_store_release() in ll_deauthorize_statahead() */
+       while (smp_load_acquire(&sai->sai_task) && lli->lli_sa_enabled) {
+               size_t numlen;
+
+               if (sai->sai_fname_zeroed_len)
+                       numlen = snprintf(ptr, max_len, "%0*llu",
+                                         sai->sai_fname_zeroed_len,
+                                         ++sai->sai_fname_index);
+               else
+                       numlen = snprintf(ptr, max_len, "%llu",
+                                         ++sai->sai_fname_index);
+
+               ll_statahead_handle(sai, parent, fname, len + numlen, NULL);
+
+               if (sa_low_hit(sai)) {
+                       rc = -EFAULT;
+                       atomic_inc(&sbi->ll_sa_wrong);
+                       CDEBUG(D_CACHE, "%s: low hit ratio for %pd "DFID": hit=%llu miss=%llu sent=%llu replied=%llu, stopping PID %d\n",
+                              sbi->ll_fsname, parent, PFID(ll_inode2fid(dir)),
+                              sai->sai_hit, sai->sai_miss, sai->sai_sent,
+                              sai->sai_replied, current->pid);
                        break;
+               }
        }
 
        OBD_FREE(fname, NAME_MAX);
@@ -1409,10 +1485,13 @@ static int ll_statahead_thread(void *arg)
 
        sai->sai_bh = bh;
 
-       switch (lli->lli_sa_pattern) {
+       switch (lli->lli_sa_pattern & LSA_PATTERN_MASK) {
        case LSA_PATTERN_LIST:
                rc = ll_statahead_by_list(parent);
                break;
+       case LSA_PATTERN_ADVISE:
+               rc = ll_statahead_by_advise(sai, parent);
+               break;
        case LSA_PATTERN_FNAME:
                rc = ll_statahead_by_fname(sai, parent);
                break;
@@ -1436,7 +1515,7 @@ static int ll_statahead_thread(void *arg)
         */
        while (({set_current_state(TASK_IDLE);
                /* matches smp_store_release() in ll_deauthorize_statahead() */
-               smp_load_acquire(&sai->sai_task); })) {
+               smp_load_acquire(&sai->sai_task) && lli->lli_sa_enabled; })) {
                schedule();
        }
        __set_current_state(TASK_RUNNING);
@@ -1494,7 +1573,7 @@ void ll_authorize_statahead(struct inode *dir, void *key)
        spin_unlock(&lli->lli_sa_lock);
 }
 
-static void ll_deauthorize_statahead_fname(struct inode *dir, void *key)
+static void ll_deauthorize_statahead_advise(struct inode *dir, void *key)
 {
        struct ll_inode_info *lli = ll_i2info(dir);
        struct ll_file_data *fd = (struct ll_file_data *)key;
@@ -1507,7 +1586,8 @@ static void ll_deauthorize_statahead_fname(struct inode *dir, void *key)
        if (sai->sai_task) {
                struct task_struct *task = sai->sai_task;
 
-               sai->sai_task = NULL;
+               /* matches smp_load_acquire() in ll_statahead_thread() */
+               smp_store_release(&sai->sai_task, NULL);
                wake_up_process(task);
        }
        fd->fd_sai = NULL;
@@ -1526,21 +1606,23 @@ void ll_deauthorize_statahead(struct inode *dir, void *key)
        struct ll_inode_info *lli = ll_i2info(dir);
        struct ll_statahead_info *sai;
 
-       LASSERT(lli->lli_opendir_pid != 0);
-
        CDEBUG(D_READA, "deauthorize statahead for "DFID"\n",
               PFID(&lli->lli_fid));
 
-       if (lli->lli_sa_pattern == LSA_PATTERN_FNAME) {
-               ll_deauthorize_statahead_fname(dir, key);
+       if (lli->lli_sa_pattern & LSA_PATTERN_ADVISE) {
+               ll_deauthorize_statahead_advise(dir, key);
                return;
        }
 
+       LASSERT(lli->lli_opendir_pid != 0);
        LASSERT(lli->lli_opendir_key == key);
        spin_lock(&lli->lli_sa_lock);
        lli->lli_opendir_key = NULL;
        lli->lli_opendir_pid = 0;
        lli->lli_sa_enabled = 0;
+       lli->lli_sa_pattern = LSA_PATTERN_NONE;
+       lli->lli_sa_fname_index = 0;
+       lli->lli_sa_match_count = 0;
        sai = lli->lli_sai;
        if (sai && sai->sai_task) {
                /*
@@ -1774,9 +1856,10 @@ static int revalidate_statahead_dentry(struct inode *dir,
        if (!entry)
                GOTO(out, rc = -EAGAIN);
 
-       if (lli->lli_sa_pattern == LSA_PATTERN_LIST)
+       if (lli->lli_sa_pattern & LSA_PATTERN_LIST ||
+           lli->lli_sa_pattern & LSA_PATTERN_FNAME)
                LASSERT(sai == entry->se_sai);
-       else if (lli->lli_sa_pattern == LSA_PATTERN_FNAME)
+       else if (lli->lli_sa_pattern == LSA_PATTERN_ADVISE)
                sai = entry->se_sai;
 
        LASSERT(sai != NULL);
@@ -1860,6 +1943,113 @@ out:
        RETURN(rc);
 }
 
+static inline bool
+sa_pattern_list_detect(struct inode *dir, struct dentry *dchild, int *first)
+{
+       struct ll_inode_info *lli = ll_i2info(dir);
+
+       if (lli->lli_opendir_pid == 0)
+               return false;
+
+       if (lli->lli_sa_enabled == 0)
+               return false;
+
+       if (lli->lli_sa_pattern & LSA_PATTERN_LS_NOT_FIRST_DE)
+               return false;
+
+       *first = is_first_dirent(dir, dchild);
+       if (*first == LS_NOT_FIRST_DE) {
+               /*
+                * It is not "ls -{a}l" operation, no need statahead for it.
+                * Disable statahead so that subsequent stat() won't waste
+                * time to try it.
+                */
+               spin_lock(&lli->lli_sa_lock);
+               if (lli->lli_opendir_pid == current->pid) {
+                       lli->lli_sa_enabled = 0;
+                       lli->lli_sa_pattern |= LSA_PATTERN_LS_NOT_FIRST_DE;
+               }
+               spin_unlock(&lli->lli_sa_lock);
+               return false;
+       }
+
+       spin_lock(&lli->lli_sa_lock);
+       lli->lli_sa_pattern |= LSA_PATTERN_LIST;
+       spin_unlock(&lli->lli_sa_lock);
+       return true;
+}
+
+static inline bool
+sa_pattern_fname_detect(struct inode *dir, struct dentry *dchild)
+{
+       struct ll_inode_info *lli = ll_i2info(dir);
+       struct qstr *dname = &dchild->d_name;
+       const unsigned char *name = dname->name;
+       bool rc = false;
+       int i;
+
+       if (ll_i2sbi(dir)->ll_enable_statahead_fname == 0)
+               return false;
+
+       /*
+        * Parse the format of the file name to determine whether it matches
+        * the supported file name pattern for statahead (i.e. mdtest.$i).
+        */
+       i = dname->len - 1;
+       if (isdigit(name[i])) {
+               long num;
+               int ret;
+
+               while (--i >= 0 && isdigit(name[i]))
+                       /* do nothing */;
+               i++;
+               ret = kstrtol(&name[i], 0, &num);
+               if (ret)
+                       GOTO(out, rc);
+
+               /*
+                * The traversing program do multiple stat() calls on the same
+                * children entry. i.e. ls $dir*.
+                */
+               if (lli->lli_sa_fname_index == num)
+                       return false;
+
+               if (lli->lli_sa_match_count == 0 ||
+                   num == lli->lli_sa_fname_index + 1) {
+                       lli->lli_sa_match_count++;
+                       lli->lli_sa_fname_index = num;
+
+                       if (lli->lli_sa_match_count > LSA_FN_MATCH_HIT) {
+                               lli->lli_sa_pattern |= LSA_PATTERN_FN_UNIQUE;
+                               GOTO(out, rc = true);
+                       }
+
+                       return false;
+               }
+       }
+out:
+       spin_lock(&lli->lli_sa_lock);
+       if (rc) {
+               lli->lli_sa_pattern |= LSA_PATTERN_FNAME;
+       } else {
+               lli->lli_sa_pattern = LSA_PATTERN_NONE;
+               lli->lli_sa_match_count = 0;
+               lli->lli_sa_fname_index = 0;
+               lli->lli_sa_enabled = 0;
+       }
+       spin_unlock(&lli->lli_sa_lock);
+
+       return rc;
+}
+
+/* detect the statahead pattern. */
+static inline bool
+sa_pattern_detect(struct inode *dir, struct dentry *dchild, int *first)
+{
+       return sa_pattern_list_detect(dir, dchild, first) ||
+              sa_pattern_fname_detect(dir, dchild);
+}
+
 /**
  * start statahead thread
  *
@@ -1888,11 +2078,8 @@ static int start_statahead_thread(struct inode *dir, struct dentry *dentry,
 
        ENTRY;
 
-       /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */
-       first = is_first_dirent(dir, dentry);
-       if (first == LS_NOT_FIRST_DE)
-               /* It is not "ls -{a}l" operation, no need statahead for it. */
-               GOTO(out, rc = -EFAULT);
+       if (sa_pattern_detect(dir, dentry, &first) == false)
+               RETURN(0);
 
        if (unlikely(atomic_inc_return(&sbi->ll_sa_running) >
                                       sbi->ll_sa_running_max)) {
@@ -1911,21 +2098,43 @@ static int start_statahead_thread(struct inode *dir, struct dentry *dentry,
 
        sai->sai_ls_all = (first == LS_FIRST_DOT_DE);
 
+       if (lli->lli_sa_pattern & LSA_PATTERN_FNAME) {
+               struct qstr *dname = &dentry->d_name;
+               const unsigned char *name = dname->name;
+               int rc;
+               int i;
+
+               if (dname->len >= sizeof(sai->sai_fname))
+                       GOTO(out, rc = -ERANGE);
+
+               i = dname->len;
+               while (--i >= 0 && isdigit(name[i]))
+                       /* do nothing */;
+               i++;
+
+               memcpy(sai->sai_fname, dname->name, i);
+               sai->sai_fname[i] = '\0';
+               sai->sai_fname_index = lli->lli_sa_fname_index;
+               /* The front part of the file name is zeroed padding. */
+               if (name[i] == '0')
+                       sai->sai_fname_zeroed_len = dname->len - i;
+       }
+
        /*
         * if current lli_opendir_key was deauthorized, or dir re-opened by
         * another process, don't start statahead, otherwise the newly spawned
         * statahead thread won't be notified to quit.
         */
        spin_lock(&lli->lli_sa_lock);
-       if (unlikely(lli->lli_sai || !lli->lli_opendir_key ||
-                    lli->lli_opendir_pid != current->pid ||
-                    lli->lli_sa_pattern != LSA_PATTERN_NONE)) {
+       if (unlikely(lli->lli_sai ||
+                    ((lli->lli_sa_pattern & LSA_PATTERN_LIST) &&
+                     !lli->lli_opendir_key &&
+                     lli->lli_opendir_pid != current->pid))) {
                spin_unlock(&lli->lli_sa_lock);
                GOTO(out, rc = -EPERM);
        }
        lli->lli_sai = sai;
        lli->lli_sax = ctx;
-       lli->lli_sa_pattern = LSA_PATTERN_LIST;
        spin_unlock(&lli->lli_sa_lock);
 
        CDEBUG(D_READA, "start statahead thread: [pid %d] [parent %pd]\n",
@@ -1946,8 +2155,12 @@ static int start_statahead_thread(struct inode *dir, struct dentry *dentry,
                ll_start_agl(parent, sai);
 
        atomic_inc(&sbi->ll_sa_total);
-       sai->sai_task = task;
+       if (lli->lli_sa_pattern & LSA_PATTERN_LIST)
+               atomic_inc(&sbi->ll_sa_list_total);
+       else if (lli->lli_sa_pattern & LSA_PATTERN_FNAME)
+               atomic_inc(&sbi->ll_sa_fname_total);
 
+       sai->sai_task = task;
        wake_up_process(task);
        /*
         * We don't stat-ahead for the first dirent since we are already in
@@ -2123,15 +2336,14 @@ int ll_ioctl_ahead(struct file *file, struct llapi_lu_ladvise2 *ladvise)
                        struct ll_statahead_context *tmp = ctx;
 
                        if (lli->lli_sa_pattern == LSA_PATTERN_NONE ||
-                           lli->lli_sa_pattern == LSA_PATTERN_FNAME) {
-                               lli->lli_sa_pattern = LSA_PATTERN_FNAME;
+                           lli->lli_sa_pattern == LSA_PATTERN_ADVISE) {
+                               lli->lli_sa_pattern = LSA_PATTERN_ADVISE;
                                ctx = lli->lli_sax;
                                __ll_sax_get(ctx);
                                fd->fd_sai = __ll_sai_get(sai);
                                rc = 0;
-                       } else {
                                rc = -EINVAL;
-                               CWARN("%s: pattern %X is not FNAME: rc = %d\n",
+                               CWARN("%s: pattern %X is not ADVISE: rc = %d\n",
                                      sbi->ll_fsname, lli->lli_sa_pattern, rc);
                        }
 
@@ -2140,20 +2352,20 @@ int ll_ioctl_ahead(struct file *file, struct llapi_lu_ladvise2 *ladvise)
                        if (rc)
                                GOTO(out, rc);
                } else {
-                       lli->lli_sa_pattern = LSA_PATTERN_FNAME;
+                       lli->lli_sa_pattern = LSA_PATTERN_ADVISE;
                        lli->lli_sax = ctx;
                        fd->fd_sai = __ll_sai_get(sai);
                        spin_unlock(&lli->lli_sa_lock);
                }
        } else {
                spin_lock(&lli->lli_sa_lock);
-               if (!(lli->lli_sa_pattern == LSA_PATTERN_FNAME ||
+               if (!(lli->lli_sa_pattern == LSA_PATTERN_ADVISE ||
                      lli->lli_sa_pattern == LSA_PATTERN_NONE)) {
                        spin_unlock(&lli->lli_sa_lock);
                        GOTO(out, rc = -EINVAL);
                }
 
-               lli->lli_sa_pattern = LSA_PATTERN_FNAME;
+               lli->lli_sa_pattern = LSA_PATTERN_ADVISE;
                fd->fd_sai = __ll_sai_get(sai);
                spin_unlock(&lli->lli_sa_lock);
        }
@@ -2196,3 +2408,48 @@ out:
        atomic_dec(&sbi->ll_sa_running);
        RETURN(rc);
 }
+
+/*
+ * This function is called in each stat() system call to do statahead check.
+ * When the names of the files stat()ed in sequence under a directory roughly
+ * follow a certain naming rule, this directory is considered a candidate
+ * for statahead.
+ * For example, with the file naming rule mdtest.$rank.$i the stat() dentry
+ * name ends with a number; once more than @LSA_FN_PREDICT_HIT lookups in a
+ * row are for dentries whose name ends with a digit, the corresponding
+ * directory meets the requirement for statahead.
+ */
+void ll_statahead_enter(struct inode *dir, struct dentry *dchild)
+{
+       struct ll_inode_info *lli;
+       struct qstr *dname = &dchild->d_name;
+
+       if (ll_i2sbi(dir)->ll_sa_max == 0)
+               return;
+
+       if (ll_i2sbi(dir)->ll_enable_statahead_fname == 0)
+               return;
+
+       lli = ll_i2info(dir);
+       if (lli->lli_sa_enabled)
+               return;
+
+       if (lli->lli_sa_pattern & (LSA_PATTERN_FN_PREDICT | LSA_PATTERN_LIST))
+               return;
+
+       /*
+        * Only the number-indexed regularized file name pattern is supported
+        * for now. Quick check whether the last character is a digit.
+        */
+       if (!isdigit(dname->name[dname->len - 1])) {
+               lli->lli_sa_match_count = 0;
+               return;
+       }
+
+       lli->lli_sa_match_count++;
+       if (lli->lli_sa_match_count > LSA_FN_PREDICT_HIT) {
+               lli->lli_sa_pattern |= LSA_PATTERN_FN_PREDICT;
+               lli->lli_sa_enabled = 1;
+               lli->lli_sa_match_count = 0;
+       }
+}
index 493dce8..9dcf442 100644 (file)
@@ -2058,6 +2058,7 @@ int req_capsule_server_pack(struct req_capsule *pill)
                                len += max;
                        else
                                len += len;
+
                        rc = req_capsule_server_grow(&req->rq_pill,
                                                     &RMF_BUT_REPLY, len);
                        if (rc)
index 3dd3351..ced10df 100755 (executable)
@@ -1983,7 +1983,7 @@ test_27p() {
 
        reset_enospc
        rm -f $DIR/$tdir/$tfile
-       test_mkdir $DIR/$tdir
+       test_mkdir $DIR/$tdir || error "failed to mkdir $DIR/$tdir"
 
        $MCREATE $DIR/$tdir/$tfile || error "mcreate failed"
        $TRUNCATE $DIR/$tdir/$tfile 80000000 || error "truncate failed"
@@ -14526,11 +14526,117 @@ test_123g() {
                awk '/hit.total:/ {print $2}')
        echo "Hit total: $count"
        # Hit ratio should be >= 75%
-       (( $count > num * 75 / 100)) ||
+       (( $count > num * 75 / 100 )) ||
                error "hit total $count is be > 75% of $num"
 }
 run_test 123g "Test for stat-ahead advise"
 
+test_123h_base() {
+       local dir=$DIR/$tdir
+       local cmd="touch $dir/$tfile.{$1}"
+       local fcnt=$2
+
+       stack_trap "rm -rf $dir"
+       mkdir -p $dir || error "failed to mkdir $dir"
+       eval $cmd
+
+       cancel_lru_locks mdc
+       $LCTL set_param llite.*.statahead_stats=clear
+       $LCTL set_param mdc.*.batch_stats=0
+       $LCTL set_param llite.*.statahead_max=1024
+       $LCTL set_param llite.*.statahead_batch_max=1024
+       lctl get_param -n llite.*.statahead_stats
+       du -a $dir > /dev/null
+       echo "Wait statahead thread (ll_sa_xxx) to exit..."
+       wait_update_facet client "pgrep ll_sa" "" 35 ||
+               error "ll_sa statahead thread does not quit in 35s"
+       $LCTL get_param -n llite.*.statahead_stats
+       $LCTL get_param -n mdc.*.batch_stats
+
+       local count=$($LCTL get_param -n llite.*.statahead_stats |
+                       awk '/fname.total:/ {print $2}')
+
+       [ $count == 1 ] || error "File name pattern statahead not trigger"
+       count=$($LCTL get_param -n llite.*.statahead_stats |
+               awk '/hit.total:/ {print $2}')
+       # Hit ratio should be >= 75%
+       (( $count > fcnt * 75 / 100 )) ||
+               error "hit total is too low: $count"
+       rm -rf $dir || error "rm -rf $dir failed"
+}
+
+test_123h() {
+       local max
+       local batch_max
+       local enabled
+
+       max=$($LCTL get_param -n llite.*.statahead_max | head -n 1)
+       batch_max=$($LCTL get_param -n llite.*.statahead_batch_max | head -n 1)
+       enabled=$($LCTL get_param -n llite.*.enable_statahead_fname | head -n 1)
+       stack_trap "$LCTL set_param llite.*.statahead_max=$max"
+       stack_trap "$LCTL set_param llite.*.statahead_batch_max=$batch_max"
+       stack_trap "$LCTL set_param llite.*.enable_statahead_fname=$enabled"
+
+       $LCTL set_param llite.*.enable_statahead_fname=1
+
+       echo "Scan a directory with number regularized fname"
+       test_123h_base "0..10000" 10000
+
+       echo "Scan a directory with zeroed padding number regularized fname"
+       test_123h_base "000000..010000" 10000
+}
+run_test 123h "Verify statahead work with the fname pattern via du"
+
+test_123i() {
+       local dir=$DIR/$tdir
+       local cmd="createmany -m $dir/$tfile.%06d 1000"
+
+       stack_trap "unlinkmany $dir/$tfile.%06d 1000"
+       mkdir -p $dir || error "failed to mkdir $dir"
+       eval $cmd
+
+       cancel_lru_locks mdc
+       $LCTL set_param llite.*.statahead_stats=clear
+       $LCTL set_param mdc.*.batch_stats=0
+
+       local max
+       local batch_max
+       local enabled
+
+       max=$($LCTL get_param -n llite.*.statahead_max | head -n 1)
+       batch_max=$($LCTL get_param -n llite.*.statahead_batch_max | head -n 1)
+       enabled=$($LCTL get_param -n llite.*.enable_statahead_fname | head -n 1)
+       stack_trap "$LCTL set_param llite.*.statahead_max=$max"
+       stack_trap "$LCTL set_param llite.*.statahead_batch_max=$batch_max"
+       stack_trap "$LCTL set_param llite.*.enable_statahead_fname=$enabled"
+
+       $LCTL set_param llite.*.statahead_max=1024
+       $LCTL set_param llite.*.statahead_batch_max=32
+       $LCTL set_param llite.*.enable_statahead_fname=1
+       echo "statahead_stats (Pre):"
+       lctl get_param -n llite.*.statahead_stats
+       ls $dir/* > /dev/null
+       echo "statahead_stats (Post):"
+       $LCTL get_param -n llite.*.statahead_stats
+       $LCTL get_param -n mdc.*.batch_stats
+
+       echo "Wait the statahead thread (ll_sa_xxx) to exit ..."
+       wait_update_facet client "pgrep ll_sa" "" 35 ||
+               error "ll_sa statahead thread does not quit in 35s"
+       $LCTL get_param -n llite.*.statahead_stats
+       $LCTL get_param -n mdc.*.batch_stats
+
+       local count=$($LCTL get_param -n llite.*.statahead_stats |
+                       awk '/fname.total:/ {print $2}')
+
+       [ $count == 1 ] || error "File name pattern statahead not trigger"
+       count=$($LCTL get_param -n llite.*.statahead_stats |
+               awk '/hit.total:/ {print $2}')
+       # Hit ratio should be >= 75%
+       (( $count > 75 )) || error "hit total is too low: $count"
+}
+run_test 123i "Verify statahead work with the fname pattern via ls dir/*"
+
 test_124a() {
        [ $PARALLEL == "yes" ] && skip "skip parallel run"
        $LCTL get_param -n mdc.*.connect_flags | grep -q lru_resize ||