LU-17463 osc: add support for unevictable mlock()ed pages 26/53826/16
author Qian Yingjin <qian@ddn.com>
Fri, 26 Jan 2024 07:49:34 +0000 (02:49 -0500)
committer Oleg Drokin <green@whamcloud.com>
Fri, 23 Aug 2024 21:57:16 +0000 (21:57 +0000)
The page cache shrinker does not distinguish mlock()ed pages from
normal unused pages in the page LRU list and may wrongly remove
them from the cache.

In this patch, we use a separate unevictable list to manage the
pages marked with the PG_mlocked flag.
As there is no direct notification or interface informing the
filesystem of page mlock()/munlock() calls, we have to scan the
whole unevictable list to check whether it contains any freeable
pages and remove them later.

Thus we implement two interfaces to scan unevictable pages that
are no longer marked with PG_mlocked and remove them from the
cache manually (a usage sketch follows this list):
- $LCTL set_param llite.*.unevict_cached_mb=clear
  Scan the unevictable lists on a Lustre client FS and clear all
  pages that are no longer marked with PG_mlocked.
  Scan the normal LRU list and move any pages marked with
  PG_mlocked into the unevictable list.
- $LCTL set_param osc.*.osc_unevict_cached_mb=clear
  Scan the unevictable list of this OSC obd device and clear all
  pages that are no longer marked with PG_mlocked.
  Scan the normal LRU list and move any pages marked with
  PG_mlocked into the unevictable list.
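
For example, after pinning a file's pages with vmtouch (which
mlock()s them), the unevictable accounting can be refreshed and
read back as follows; the mount point and file name here are
illustrative only:

    vmtouch -vltdw -m 1g /mnt/lustre/file  # mlock() the file pages
    $LCTL set_param llite.*.unevict_cached_mb=clear
    $LCTL get_param -n llite.*.unevict_cached_mb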

The command "echo 3 > /proc/sys/vm/drop_caches" can also be
used to evict the pages unlocked by munlock() system call which
are no longer marked with PG_mlocked.
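
A minimal sketch of that path (killing the vmtouch process from
the example above munlock()s the pages it pinned):

    pkill vmtouch                      # munlock() the pinned pages
    echo 3 > /proc/sys/vm/drop_caches  # drop the now-evictable pages
    $LCTL get_param -n llite.*.unevict_cached_mb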

In our design, mlocked pages are not accounted in the LRU. Thus,
the total number of cached pages on a Lustre client FS may exceed
the maximum LRU limit @max_cached_mb.

Add a tunable parameter to control whether the mlocked-pages
functionality is enabled:
llite.*.enable_mlock_pages
This allows the feature to be disabled if something goes wrong in
the field.
It is disabled by default.
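
For example, to enable the feature and read back the setting:

    $LCTL set_param llite.*.enable_mlock_pages=1
    $LCTL get_param llite.*.enable_mlock_pages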

Add test cases sanity/test_600{a,b,c,d}.
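
The new tests can be run individually with the usual sanity.sh
selector, e.g. (assuming a configured test environment with the
vmtouch utility installed):

    ONLY=600a bash lustre/tests/sanity.sh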

Signed-off-by: Qian Yingjin <qian@ddn.com>
Change-Id: I0713ad254999dfc32ed5063ec0d8e042968793a9
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53826
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Patrick Farrell <patrick.farrell@oracle.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
15 files changed:
lustre.spec.in
lustre/include/cl_object.h
lustre/include/lustre_osc.h
lustre/include/obd.h
lustre/ldlm/ldlm_lib.c
lustre/llite/llite_mmap.c
lustre/llite/lproc_llite.c
lustre/llite/rw.c
lustre/llite/vvp_page.c
lustre/obdclass/cl_page.c
lustre/osc/lproc_osc.c
lustre/osc/osc_internal.h
lustre/osc/osc_page.c
lustre/osc/osc_request.c
lustre/tests/sanity.sh

diff --git a/lustre.spec.in b/lustre.spec.in
index e157c0c..0d83191 100644
@@ -478,7 +478,7 @@ Requires: attr, rsync, lsof, /usr/bin/getconf
 Requires: /usr/sbin/getenforce, acl, /usr/bin/killall, /usr/bin/ping, bc
 # Of the supported targets, only rhel7 doesn't support Recommends.
 %if 0%{?rhel} > 7 || 0%{?fedora} > 33 || 0%{?rhel} < 1
-Recommends: perl, dbench, iozone
+Recommends: perl, dbench, iozone, vmtouch
 # Either of these is sufficient
 Suggests: pdsh, clush
 %endif
diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h
index 0c6c81a..afe3dd8 100644
@@ -2284,6 +2284,10 @@ struct cl_client_cache {
         */
        atomic_long_t           ccc_lru_left;
        /**
+        * # of unevictable LRU entries
+        */
+       atomic_long_t           ccc_unevict_lru_used;
+       /**
         * List of entities(OSCs) for this LRU cache
         */
        struct list_head        ccc_lru;
@@ -2298,7 +2302,11 @@ struct cl_client_cache {
        /**
         * Set if unstable check is enabled
         */
-       unsigned int            ccc_unstable_check:1;
+       unsigned int            ccc_unstable_check:1,
+       /**
+        * Whether unevictable (mlock pages) checking is enabled
+        */
+                               ccc_mlock_pages_enable:1;
        /**
         * # of unstable pages for this mount point
         */
diff --git a/lustre/include/lustre_osc.h b/lustre/include/lustre_osc.h
index 4713551..aba4a71 100644
@@ -529,7 +529,11 @@ struct osc_page {
        /**
         * If the page is in osc_object::oo_tree.
         */
-                               ops_intree:1;
+                               ops_intree:1,
+       /**
+        * If the page is marked with PG_mlocked.
+        */
+                               ops_vm_locked:1;
        /**
         * lru page list. See osc_lru_{del|use}() in osc_page.c for usage.
         */
diff --git a/lustre/include/obd.h b/lustre/include/obd.h
index 41d9211..061119c 100644
@@ -295,6 +295,10 @@ struct client_obd {
        atomic_long_t            cl_lru_busy;
        /** # of LRU pages in the cache for this client_obd */
        atomic_long_t            cl_lru_in_list;
+       /**
+        * # of LRU pages marked with PG_mlocked in the cache on the client.
+        */
+       atomic_long_t            cl_unevict_lru_in_list;
        /** # of threads are shrinking LRU cache. To avoid contention, it's not
         * allowed to have multiple threads shrinking LRU cache. */
        atomic_t                 cl_lru_shrinkers;
@@ -305,6 +309,8 @@ struct client_obd {
         * reclaim is sync, initiated by IO thread when the LRU slots are
         * in shortage. */
        __u64                    cl_lru_reclaim;
+       /** List of unevictable LRU pages for this client_obd */
+       struct list_head         cl_unevict_lru_list;
        /** List of LRU pages for this client_obd */
        struct list_head         cl_lru_list;
        /** Lock for LRU page list */
@@ -842,6 +848,8 @@ static inline bool obd_mdt_recovery_abort(struct obd_device *obd)
 #define KEY_CACHE_LRU_SHRINK   "cache_lru_shrink"
 #define KEY_OSP_CONNECTED      "osp_connected"
 
+#define KEY_UNEVICT_CACHE_SHRINK       "unevict_cache_shrink"
+
 /* Flags for op_xvalid */
 enum op_xvalid {
        OP_XVALID_CTIME_SET     = BIT(0),       /* 0x0001 */
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c
index fdc0bb7..539e13c 100644
@@ -408,7 +408,9 @@ int client_obd_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
        atomic_set(&cli->cl_lru_shrinkers, 0);
        atomic_long_set(&cli->cl_lru_busy, 0);
        atomic_long_set(&cli->cl_lru_in_list, 0);
+       atomic_long_set(&cli->cl_unevict_lru_in_list, 0);
        INIT_LIST_HEAD(&cli->cl_lru_list);
+       INIT_LIST_HEAD(&cli->cl_unevict_lru_list);
        spin_lock_init(&cli->cl_lru_list_lock);
        atomic_long_set(&cli->cl_unstable_count, 0);
        INIT_LIST_HEAD(&cli->cl_shrink_list);
diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c
index ee7ea29..a40920f 100644
@@ -401,10 +401,11 @@ static vm_fault_t ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                return result;
 
        CDEBUG(D_MMAP|D_IOTRACE,
-              "START file %s:"DFID", vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu\n",
+              "START file %s:"DFID", vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu vmf_flags=%#x\n",
               file_dentry(vma->vm_file)->d_name.name,
               PFID(&ll_i2info(file_inode(vma->vm_file))->lli_fid),
-              vma, vma->vm_start, vma->vm_end, vma->vm_flags, vmf->pgoff);
+              vma, vma->vm_start, vma->vm_end, vma->vm_flags, vmf->pgoff,
+              vmf->flags);
 
        /* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite
         * so that it can be killed by admin but not cause segfault by
@@ -461,10 +462,10 @@ restart:
        }
 
        CDEBUG(D_IOTRACE,
-              "COMPLETED: "DFID": vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu, rc %d\n",
+              "COMPLETED: "DFID": vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu vmf_flags=%#x: rc=%d\n",
               PFID(&ll_i2info(file_inode(vma->vm_file))->lli_fid),
               vma, vma->vm_start, vma->vm_end, vma->vm_flags, vmf->pgoff,
-              result);
+              vmf->flags, result);
 
        return result;
 }
diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c
index 2e41ae4..956bad6 100644
@@ -463,16 +463,20 @@ static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v)
        struct ll_ra_info *ra = &sbi->ll_ra_info;
        long max_cached_mb;
        long unused_mb;
+       long unevict_mb;
 
        mutex_lock(&cache->ccc_max_cache_mb_lock);
        max_cached_mb = PAGES_TO_MiB(cache->ccc_lru_max);
        unused_mb = PAGES_TO_MiB(atomic_long_read(&cache->ccc_lru_left));
+       unevict_mb = PAGES_TO_MiB(
+                       atomic_long_read(&cache->ccc_unevict_lru_used));
        mutex_unlock(&cache->ccc_max_cache_mb_lock);
 
        seq_printf(m, "users: %d\n"
                      "max_cached_mb: %ld\n"
                      "used_mb: %ld\n"
                      "unused_mb: %ld\n"
+                     "unevict_mb: %ld\n"
                      "reclaim_count: %u\n"
                      "max_read_ahead_mb: %lu\n"
                      "used_read_ahead_mb: %d\n",
@@ -480,6 +484,7 @@ static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v)
                   max_cached_mb,
                   max_cached_mb - unused_mb,
                   unused_mb,
+                  unevict_mb,
                   cache->ccc_lru_shrinkers,
                   PAGES_TO_MiB(ra->ra_max_pages),
                   PAGES_TO_MiB(atomic_read(&ra->ra_cur_pages)));
@@ -614,6 +619,97 @@ out_unlock:
 }
 LDEBUGFS_SEQ_FOPS(ll_max_cached_mb);
 
+static int ll_unevict_cached_mb_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct cl_client_cache *cache = sbi->ll_cache;
+       long unevict_mb;
+
+       mutex_lock(&cache->ccc_max_cache_mb_lock);
+       unevict_mb = PAGES_TO_MiB(
+                       atomic_long_read(&cache->ccc_unevict_lru_used));
+       mutex_unlock(&cache->ccc_max_cache_mb_lock);
+
+       seq_printf(m, "%ld\n", unevict_mb);
+       return 0;
+}
+
+static ssize_t ll_unevict_cached_mb_seq_write(struct file *file,
+                                             const char __user *buffer,
+                                             size_t count, loff_t *off)
+{
+       struct seq_file *m = file->private_data;
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct lu_env *env;
+       __u16 refcheck;
+       char kernbuf[128];
+       int rc;
+
+       ENTRY;
+
+       if (count >= sizeof(kernbuf))
+               RETURN(-EINVAL);
+
+       if (copy_from_user(kernbuf, buffer, count))
+               RETURN(-EFAULT);
+
+       kernbuf[count] = 0;
+       if (count != 5 || strncmp(kernbuf, "clear", 5) != 0)
+               RETURN(-EINVAL);
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       /* being initialized */
+       if (sbi->ll_dt_exp == NULL)
+               GOTO(out, rc = -ENODEV);
+
+       rc = obd_set_info_async(env, sbi->ll_dt_exp,
+                               sizeof(KEY_UNEVICT_CACHE_SHRINK),
+                               KEY_UNEVICT_CACHE_SHRINK,
+                               0, NULL, NULL);
+out:
+       cl_env_put(env, &refcheck);
+       if (rc >= 0)
+               rc = count;
+
+       RETURN(rc);
+}
+LDEBUGFS_SEQ_FOPS(ll_unevict_cached_mb);
+
+static int ll_enable_mlock_pages_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct cl_client_cache *cache = sbi->ll_cache;
+
+       seq_printf(m, "%d\n", cache->ccc_mlock_pages_enable);
+       return 0;
+}
+
+static ssize_t ll_enable_mlock_pages_seq_write(struct file *file,
+                                              const char __user *buffer,
+                                              size_t count, loff_t *off)
+{
+       struct seq_file *m = file->private_data;
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct cl_client_cache *cache = sbi->ll_cache;
+       bool val;
+       int rc;
+
+       rc = kstrtobool_from_user(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       cache->ccc_mlock_pages_enable = val;
+       return count;
+}
+LDEBUGFS_SEQ_FOPS(ll_enable_mlock_pages);
+
 static ssize_t pcc_async_threshold_show(struct kobject *kobj,
                                        struct attribute *attr, char *buffer)
 {
@@ -2261,6 +2357,10 @@ struct ldebugfs_vars lprocfs_llite_obd_vars[] = {
          .fops =       &ll_site_stats_fops                     },
        { .name =       "max_cached_mb",
          .fops =       &ll_max_cached_mb_fops                  },
+       { .name =       "unevict_cached_mb",
+         .fops =       &ll_unevict_cached_mb_fops              },
+       { .name =       "enable_mlock_pages",
+         .fops =       &ll_enable_mlock_pages_fops             },
        { .name =       "statahead_stats",
          .fops =       &ll_statahead_stats_fops                },
        { .name =       "unstable_stats",
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index 0ea0358..d315775 100644
@@ -2149,6 +2149,8 @@ int ll_readpage(struct file *file, struct page *vmpage)
        } else {
                unlock_page(vmpage);
                result = PTR_ERR(page);
+               CDEBUG(D_CACHE, "failed to alloc page@%pK index %ld: rc = %d\n",
+                      vmpage, vmpage->index, result);
        }
 
 out:
diff --git a/lustre/llite/vvp_page.c b/lustre/llite/vvp_page.c
index edc3be1..b3659d5 100644
@@ -75,6 +75,8 @@ static void vvp_page_delete(const struct lu_env *env,
                LASSERT(PageLocked(vmpage));
                LASSERT((struct cl_page *)vmpage->private == cp);
 
+               CDEBUG(D_CACHE, "delete page %pK index %ld\n",
+                      vmpage, vmpage->index);
                /* Drop the reference count held in vvp_page_init */
                refcount_dec(&cp->cp_ref);
 
diff --git a/lustre/obdclass/cl_page.c b/lustre/obdclass/cl_page.c
index f25ab21..a90316b 100644
@@ -1159,6 +1159,7 @@ struct cl_client_cache *cl_cache_init(unsigned long lru_page_max)
        refcount_set(&cache->ccc_users, 1);
        cache->ccc_lru_max = lru_page_max;
        atomic_long_set(&cache->ccc_lru_left, lru_page_max);
+       atomic_long_set(&cache->ccc_unevict_lru_used, 0);
        spin_lock_init(&cache->ccc_lru_lock);
        INIT_LIST_HEAD(&cache->ccc_lru);
 
diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c
index 0bce55b..8f30ec8 100644
@@ -191,10 +191,12 @@ static int osc_cached_mb_seq_show(struct seq_file *m, void *v)
 
        seq_printf(m, "used_mb: %ld\n"
                   "busy_cnt: %ld\n"
+                  "unevict_cnt: %ld\n"
                   "reclaim: %llu\n",
                   (atomic_long_read(&cli->cl_lru_in_list) +
                    atomic_long_read(&cli->cl_lru_busy)) >> shift,
-                   atomic_long_read(&cli->cl_lru_busy),
+                  atomic_long_read(&cli->cl_lru_busy),
+                  atomic_long_read(&cli->cl_unevict_lru_in_list),
                   cli->cl_lru_reclaim);
 
        return 0;
@@ -244,6 +246,56 @@ static ssize_t osc_cached_mb_seq_write(struct file *file,
 
 LPROC_SEQ_FOPS(osc_cached_mb);
 
+static int osc_unevict_cached_mb_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *obd = m->private;
+       struct client_obd *cli = &obd->u.cli;
+       int shift = 20 - PAGE_SHIFT;
+
+       seq_printf(m, "%ld\n",
+                  atomic_long_read(&cli->cl_unevict_lru_in_list) >> shift);
+       return 0;
+}
+
+static ssize_t osc_unevict_cached_mb_seq_write(struct file *file,
+                                              const char __user *buffer,
+                                              size_t count, loff_t *off)
+{
+       struct seq_file *m = file->private_data;
+       struct obd_device *obd = m->private;
+       struct client_obd *cli = &obd->u.cli;
+       char kernbuf[128];
+
+       if (count >= sizeof(kernbuf))
+               return -EINVAL;
+
+       if (copy_from_user(kernbuf, buffer, count))
+               return -EFAULT;
+
+       kernbuf[count] = 0;
+       if (count == 5 && strncmp(kernbuf, "clear", 5) == 0) {
+               struct lu_env *env;
+               __u16 refcheck;
+
+               env = cl_env_get(&refcheck);
+               if (!IS_ERR(env)) {
+                       (void)osc_unevict_cache_shrink(env, cli);
+                       /*
+                        * Scan the LRU list, discard the LRU pages or move
+                        * the unevictable/mlock()ed pages into the unevictable
+                        * list.
+                        */
+                       (void)osc_lru_shrink(env, cli,
+                               atomic_long_read(&cli->cl_lru_in_list), true);
+                       cl_env_put(env, &refcheck);
+               }
+               return count;
+       }
+
+       return -EINVAL;
+}
+LPROC_SEQ_FOPS(osc_unevict_cached_mb);
+
 static ssize_t cur_dirty_bytes_show(struct kobject *kobj,
                                    struct attribute *attr,
                                    char *buf)
@@ -711,6 +763,8 @@ struct lprocfs_vars lprocfs_osc_obd_vars[] = {
          .fops =       &osc_obd_max_pages_per_rpc_fops },
        { .name =       "osc_cached_mb",
          .fops =       &osc_cached_mb_fops             },
+       { .name =       "osc_unevict_cached_mb",
+         .fops =       &osc_unevict_cached_mb_fops     },
        { .name =       "cur_grant_bytes",
          .fops =       &osc_cur_grant_bytes_fops       },
        { .name =       "checksum_type",
diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h
index ee68cf4..39e366d 100644
@@ -181,6 +181,8 @@ extern unsigned long osc_cache_shrink_count(struct shrinker *sk,
                                            struct shrink_control *sc);
 extern unsigned long osc_cache_shrink_scan(struct shrinker *sk,
                                           struct shrink_control *sc);
+extern long osc_unevict_cache_shrink(const struct lu_env *env,
+                                    struct client_obd *cli);
 static inline unsigned int osc_max_write_chunks(const struct client_obd *cli)
 {
        /*
diff --git a/lustre/osc/osc_page.c b/lustre/osc/osc_page.c
index bb8b350..dbe8457 100644
@@ -423,16 +423,33 @@ void osc_lru_add_batch(struct client_obd *cli, struct list_head *plist)
                cli->cl_lru_last_used = ktime_get_real_seconds();
                spin_unlock(&cli->cl_lru_list_lock);
 
-               if (waitqueue_active(&osc_lru_waitq))
+               if (waitqueue_active(&osc_lru_waitq)) {
                        (void)ptlrpcd_queue_work(cli->cl_lru_work);
+                       CDEBUG(D_CACHE,
+                              "%s: cli %pK add LRU: i%ld/b%ld/u%ld/l%ld/m%ld %ld\n",
+                              cli_name(cli), cli,
+                              atomic_long_read(&cli->cl_lru_in_list),
+                              atomic_long_read(&cli->cl_lru_busy),
+                              atomic_long_read(&cli->cl_unevict_lru_in_list),
+                              atomic_long_read(cli->cl_lru_left),
+                              cli->cl_cache->ccc_lru_max, npages);
+               }
+
        }
 }
 
 static void __osc_lru_del(struct client_obd *cli, struct osc_page *opg)
 {
-       LASSERT(atomic_long_read(&cli->cl_lru_in_list) > 0);
+       LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0);
+
        list_del_init(&opg->ops_lru);
-       atomic_long_dec(&cli->cl_lru_in_list);
+       if (opg->ops_vm_locked) {
+               atomic_long_dec(&cli->cl_unevict_lru_in_list);
+               atomic_long_dec(&cli->cl_cache->ccc_unevict_lru_used);
+               opg->ops_vm_locked = 0;
+       } else {
+               atomic_long_dec(&cli->cl_lru_in_list);
+       }
 }
 
 /**
@@ -442,8 +459,11 @@ static void __osc_lru_del(struct client_obd *cli, struct osc_page *opg)
 static void osc_lru_del(struct client_obd *cli, struct osc_page *opg)
 {
        if (opg->ops_in_lru) {
+               bool mlocked = false;
+
                spin_lock(&cli->cl_lru_list_lock);
                if (!list_empty(&opg->ops_lru)) {
+                       mlocked = opg->ops_vm_locked;
                        __osc_lru_del(cli, opg);
                } else {
                        LASSERT(atomic_long_read(&cli->cl_lru_busy) > 0);
@@ -451,7 +471,8 @@ static void osc_lru_del(struct client_obd *cli, struct osc_page *opg)
                }
                spin_unlock(&cli->cl_lru_list_lock);
 
-               atomic_long_inc(cli->cl_lru_left);
+               if (!mlocked)
+                       atomic_long_inc(cli->cl_lru_left);
                /* this is a great place to release more LRU pages if
                 * this osc occupies too many LRU pages and kernel is
                 * stealing one of them. */
@@ -528,49 +549,169 @@ static inline bool lru_page_busy(struct client_obd *cli, struct cl_page *page)
 }
 
 /**
- * Drop @target of pages from LRU at most.
+ * Check whether a page is mlocked and unevictable.
  */
-long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
-                  long target, bool force)
+static inline bool lru_page_unevictable(struct cl_page *clpage)
+{
+       return PageMlocked(cl_page_vmpage(clpage));
+}
+
+enum shrink_action {
+       SK_ACTION_WILL_FREE     = 0,
+       SK_ACTION_OWN_FAIL      = 1,
+       SK_ACTION_UNEVICT_ADD   = 2,
+       SK_ACTION_UNEVICT_DEL   = 3,
+       SK_ACTION_BUSY_SKIP     = 4,
+       SK_ACTION_INVAL         = 6,
+       SK_ACTION_MAX,
+};
+
+static inline bool
+cache_unevict_check_enabled(struct client_obd *cli)
+{
+       return cli->cl_cache->ccc_mlock_pages_enable;
+}
+
+static inline enum shrink_action
+osc_normal_lru_check(const struct lu_env *env, struct client_obd *cli,
+                    struct cl_io *io, struct osc_page *opg)
+{
+       struct cl_page *clpage = opg->ops_cl.cpl_page;
+       enum shrink_action action = SK_ACTION_OWN_FAIL;
+
+       if (cl_page_own_try(env, io, clpage) == 0) {
+               if (cache_unevict_check_enabled(cli) &&
+                   lru_page_unevictable(clpage)) {
+                       opg->ops_vm_locked = 1;
+                       cl_page_disown(env, io, clpage);
+                       list_move_tail(&opg->ops_lru,
+                                      &cli->cl_unevict_lru_list);
+                       return SK_ACTION_UNEVICT_ADD;
+               }
+               if (!lru_page_busy(cli, clpage)) {
+                       /*
+                        * remove it from lru list earlier to avoid
+                        * lock contention.
+                        */
+                       __osc_lru_del(cli, opg);
+                       opg->ops_in_lru = 0; /* will be discarded */
+
+                       cl_page_get(clpage);
+                       return SK_ACTION_WILL_FREE;
+               }
+
+               cl_page_disown(env, io, clpage);
+               action = SK_ACTION_BUSY_SKIP;
+       }
+
+       list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+
+       return action;
+}
+
+static inline enum shrink_action
+osc_unevict_lru_check(const struct lu_env *env, struct client_obd *cli,
+                     struct cl_io *io, struct osc_page *opg)
+{
+       struct cl_page *clpage = opg->ops_cl.cpl_page;
+       enum shrink_action action = SK_ACTION_OWN_FAIL;
+
+       if (cl_page_own_try(env, io, clpage) == 0) {
+               if (!lru_page_busy(cli, clpage) &&
+                   !lru_page_unevictable(clpage)) {
+                       LASSERT(opg->ops_vm_locked == 1);
+                       __osc_lru_del(cli, opg);
+                       opg->ops_in_lru = 0; /* will be discarded */
+
+                       cl_page_get(clpage);
+                       return SK_ACTION_UNEVICT_DEL;
+               }
+
+               cl_page_disown(env, io, clpage);
+               action = SK_ACTION_BUSY_SKIP;
+       }
+
+       list_move_tail(&opg->ops_lru, &cli->cl_unevict_lru_list);
+
+       return action;
+}
+
+/*
+ * Which LRU list the shrink work was initiated for.
+ */
+enum sk_reason {
+       SK_REASON_NORMAL_LRU,
+       SK_REASON_UNEVICT_LRU,
+};
+
+static inline enum shrink_action
+osc_lru_page_check(const struct lu_env *env, struct client_obd *cli,
+                  enum sk_reason reason, struct cl_io *io,
+                  struct osc_page *opg)
+{
+       switch (reason) {
+       case SK_REASON_NORMAL_LRU:
+               return osc_normal_lru_check(env, cli, io, opg);
+       case SK_REASON_UNEVICT_LRU:
+               return osc_unevict_lru_check(env, cli, io, opg);
+       default:
+               CERROR("%s: unsupported shrink type: %d\n",
+                      cli_name(cli), reason);
+               LBUG();
+               return SK_ACTION_INVAL;
+       }
+}
+
+static inline int osc_lru_maxscan(enum sk_reason reason, long *target,
+                                 bool force, atomic_long_t *lru_in_list)
+{
+       int maxscan;
+
+       if (force && reason == SK_REASON_UNEVICT_LRU) {
+               maxscan = atomic_long_read(lru_in_list);
+               if (*target == 0)
+                       *target = maxscan;
+       } else {
+               maxscan = min((*target) << 1, atomic_long_read(lru_in_list));
+       }
+
+       return maxscan;
+}
+
+static long osc_lru_list_shrink(const struct lu_env *env,
+                               struct client_obd *cli,
+                               enum sk_reason reason,
+                               struct list_head *lru_list,
+                               atomic_long_t *lru_in_list,
+                               long target, bool force,
+                               long *unevict_delta)
 {
-       struct cl_io *io;
        struct cl_object *clobj = NULL;
        struct cl_page **pvec;
        struct osc_page *opg;
+       struct cl_io *io;
        long count = 0;
-       int maxscan = 0;
        int index = 0;
+       int maxscan;
        int rc = 0;
+       enum shrink_action action;
+       int actnum[SK_ACTION_MAX] = { 0 };
+
        ENTRY;
 
-       LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0);
-       if (atomic_long_read(&cli->cl_lru_in_list) == 0 || target <= 0)
+       LASSERT(atomic_long_read(lru_in_list) >= 0);
+       if (atomic_long_read(lru_in_list) == 0 || target < 0)
                RETURN(0);
 
-       CDEBUG(D_CACHE, "%s: shrinkers: %d, force: %d\n",
-              cli_name(cli), atomic_read(&cli->cl_lru_shrinkers), force);
-       if (!force) {
-               if (atomic_read(&cli->cl_lru_shrinkers) > 0)
-                       RETURN(-EBUSY);
-
-               if (atomic_inc_return(&cli->cl_lru_shrinkers) > 1) {
-                       atomic_dec(&cli->cl_lru_shrinkers);
-                       RETURN(-EBUSY);
-               }
-       } else {
-               atomic_inc(&cli->cl_lru_shrinkers);
-       }
-
        pvec = (struct cl_page **)osc_env_info(env)->oti_pvec;
        io = osc_env_thread_io(env);
 
        spin_lock(&cli->cl_lru_list_lock);
-       if (force)
+       if (force && reason == SK_REASON_NORMAL_LRU)
                cli->cl_lru_reclaim++;
-       maxscan = min(target << 1, atomic_long_read(&cli->cl_lru_in_list));
-       while (!list_empty(&cli->cl_lru_list)) {
+       maxscan = osc_lru_maxscan(reason, &target, force, lru_in_list);
+       while (!list_empty(lru_list)) {
                struct cl_page *page;
-               bool will_free = false;
 
                if (!force && atomic_read(&cli->cl_lru_shrinkers) > 1)
                        break;
@@ -578,11 +719,13 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
                if (--maxscan < 0)
                        break;
 
-               opg = list_first_entry(&cli->cl_lru_list, struct osc_page,
-                                      ops_lru);
+               opg = list_first_entry(lru_list, struct osc_page, ops_lru);
                page = opg->ops_cl.cpl_page;
-               if (lru_page_busy(cli, page)) {
-                       list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+               if (lru_page_busy(cli, page) &&
+                   !(reason == SK_REASON_NORMAL_LRU &&
+                     lru_page_unevictable(page))) {
+                       list_move_tail(&opg->ops_lru, lru_list);
+                       actnum[SK_ACTION_BUSY_SKIP]++;
                        continue;
                }
 
@@ -617,24 +760,22 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
                        continue;
                }
 
-               if (cl_page_own_try(env, io, page) == 0) {
-                       if (!lru_page_busy(cli, page)) {
-                               /* remove it from lru list earlier to avoid
-                                * lock contention */
-                               __osc_lru_del(cli, opg);
-                               opg->ops_in_lru = 0; /* will be discarded */
-
-                               cl_page_get(page);
-                               will_free = true;
-                       } else {
-                               cl_page_disown(env, io, page);
-                       }
-               }
-
-               if (!will_free) {
-                       list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+               action = osc_lru_page_check(env, cli, reason, io, opg);
+               actnum[action]++;
+               if (action == SK_ACTION_UNEVICT_ADD) {
+                       if (unevict_delta)
+                               (*unevict_delta)++;
+                       /*
+                        * The page is moved from the normal LRU list into
+                        * the unevict list.
+                        */
+                       if (++count >= target)
+                               break;
                        continue;
                }
+               if (action != SK_ACTION_WILL_FREE &&
+                   action != SK_ACTION_UNEVICT_DEL)
+                       continue;
 
                /* Don't discard and free the page with cl_lru_list held */
                pvec[index++] = page;
@@ -649,6 +790,20 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
                if (++count >= target)
                        break;
        }
+
+       CDEBUG(D_CACHE, "%s: LRU %s empty %d maxscan %d i%ld/u%ld/b%ld/l%ld actcnt %d/%d/%d/%d/%d count %ld\n",
+              cli_name(cli),
+              reason == SK_REASON_NORMAL_LRU ? "normal" : "unevict",
+              list_empty(lru_list), maxscan,
+              atomic_long_read(&cli->cl_lru_in_list),
+              atomic_long_read(&cli->cl_unevict_lru_in_list),
+              atomic_long_read(&cli->cl_lru_busy),
+              atomic_long_read(cli->cl_lru_left),
+              actnum[SK_ACTION_WILL_FREE],
+              actnum[SK_ACTION_OWN_FAIL],
+              actnum[SK_ACTION_UNEVICT_ADD],
+              actnum[SK_ACTION_UNEVICT_DEL],
+              actnum[SK_ACTION_BUSY_SKIP], count);
        spin_unlock(&cli->cl_lru_list_lock);
 
        if (clobj != NULL) {
@@ -659,12 +814,78 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
                cond_resched();
        }
 
+       RETURN(count > 0 ? count : rc);
+}
+
+/**
+ * Drop @target of pages from LRU at most.
+ */
+long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
+                  long target, bool force)
+{
+       struct cl_client_cache *cache = cli->cl_cache;
+       long unevict_delta = 0;
+       long shrank = 0;
+       long count = 0;
+
+       ENTRY;
+
+       LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0);
+       if (atomic_long_read(&cli->cl_lru_in_list) == 0 || target <= 0)
+               RETURN(0);
+
+       CDEBUG(D_CACHE,
+              "%s: shrinkers: %d force: %d target: %ld LRU: i%ld/u%ld/b%ld/l%ld\n",
+              cli_name(cli), atomic_read(&cli->cl_lru_shrinkers), force,
+              target, atomic_long_read(&cli->cl_lru_in_list),
+              atomic_long_read(&cli->cl_unevict_lru_in_list),
+              atomic_long_read(&cli->cl_lru_busy),
+              atomic_long_read(cli->cl_lru_left));
+       if (!force) {
+               if (atomic_read(&cli->cl_lru_shrinkers) > 0)
+                       RETURN(-EBUSY);
+
+               if (atomic_inc_return(&cli->cl_lru_shrinkers) > 1) {
+                       atomic_dec(&cli->cl_lru_shrinkers);
+                       RETURN(-EBUSY);
+               }
+       } else {
+               atomic_inc(&cli->cl_lru_shrinkers);
+       }
+
+       count = osc_lru_list_shrink(env, cli, SK_REASON_NORMAL_LRU,
+                                   &cli->cl_lru_list, &cli->cl_lru_in_list,
+                                   target, force, &unevict_delta);
+       if (count < 0)
+               GOTO(out, count);
+
+       shrank = count;
+       if (force)
+               GOTO(out, count);
+
+       /*
+        * TODO: In non-force mode, should we also scan the unevictable
+        * list and try to free pages that are no longer marked PG_mlocked?
+        */
+out:
        atomic_dec(&cli->cl_lru_shrinkers);
-       if (count > 0) {
-               atomic_long_add(count, cli->cl_lru_left);
+       if (unevict_delta > 0) {
+               atomic_long_sub(unevict_delta, &cli->cl_lru_in_list);
+               atomic_long_add(unevict_delta, &cli->cl_unevict_lru_in_list);
+               atomic_long_add(unevict_delta, &cache->ccc_unevict_lru_used);
+       }
+       if (shrank > 0) {
+               atomic_long_add(shrank, cli->cl_lru_left);
+               CDEBUG(D_CACHE,
+                      "%s: LRU shrink %ld i%ld/u%ld/b%ld/l%ld\n",
+                      cli_name(cli), shrank,
+                      atomic_long_read(&cli->cl_lru_in_list),
+                      atomic_long_read(&cli->cl_unevict_lru_in_list),
+                      atomic_long_read(&cli->cl_lru_busy),
+                      atomic_long_read(cli->cl_lru_left));
                wake_up(&osc_lru_waitq);
        }
-       RETURN(count > 0 ? count : rc);
+       RETURN(shrank > 0 ? shrank : count);
 }
 EXPORT_SYMBOL(osc_lru_shrink);
 
@@ -681,7 +902,9 @@ static long osc_lru_reclaim(struct client_obd *cli, unsigned long npages)
        struct client_obd *scan;
        int max_scans;
        __u16 refcheck;
+       long shrank = 0;
        long rc = 0;
+
        ENTRY;
 
        LASSERT(cache != NULL);
@@ -699,14 +922,20 @@ static long osc_lru_reclaim(struct client_obd *cli, unsigned long npages)
                       cli_name(cli), rc, npages);
                if (osc_cache_too_much(cli) > 0)
                        ptlrpcd_queue_work(cli->cl_lru_work);
+               shrank = rc;
                GOTO(out, rc);
        } else if (rc > 0) {
+               shrank = rc;
                npages -= rc;
        }
 
-       CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %ld/%ld, want: %ld\n",
-               cli_name(cli), cli, atomic_long_read(&cli->cl_lru_in_list),
-               atomic_long_read(&cli->cl_lru_busy), npages);
+       CDEBUG(D_CACHE,
+              "%s: cli %p no free slots, pages: i%ld/u%ld/b%ld/l%ld/m%ld, want: %ld\n",
+              cli_name(cli), cli, atomic_long_read(&cli->cl_lru_in_list),
+              atomic_long_read(&cli->cl_unevict_lru_in_list),
+              atomic_long_read(&cli->cl_lru_busy),
+              atomic_long_read(cli->cl_lru_left),
+              cli->cl_cache->ccc_lru_max, npages);
 
        /* Reclaim LRU slots from other client_obd as it can't free enough
         * from its own. This should rarely happen. */
@@ -721,10 +950,12 @@ static long osc_lru_reclaim(struct client_obd *cli, unsigned long npages)
               (scan = list_first_entry_or_null(&cache->ccc_lru,
                                                  struct client_obd,
                                                  cl_lru_osc)) != NULL) {
-               CDEBUG(D_CACHE, "%s: cli %p LRU pages: %ld, busy: %ld.\n",
+               CDEBUG(D_CACHE,
+                      "%s: cli %p LRU pages: %ld, busy: %ld, unevict: %ld.\n",
                       cli_name(scan), scan,
                       atomic_long_read(&scan->cl_lru_in_list),
-                      atomic_long_read(&scan->cl_lru_busy));
+                      atomic_long_read(&scan->cl_lru_busy),
+                      atomic_long_read(&scan->cl_unevict_lru_in_list));
 
                list_move_tail(&scan->cl_lru_osc, &cache->ccc_lru);
                if (osc_cache_too_much(scan) > 0) {
@@ -732,19 +963,25 @@ static long osc_lru_reclaim(struct client_obd *cli, unsigned long npages)
 
                        rc = osc_lru_shrink(env, scan, npages, true);
                        spin_lock(&cache->ccc_lru_lock);
-                       if (rc >= npages)
+                       if (rc >= npages) {
+                               shrank += rc;
                                break;
-                       if (rc > 0)
+                       }
+                       if (rc > 0) {
+                               shrank += rc;
                                npages -= rc;
+                       }
                }
        }
        spin_unlock(&cache->ccc_lru_lock);
 
+       if (shrank > 0)
+               GOTO(out, rc);
 out:
        cl_env_put(env, &refcheck);
-       CDEBUG(D_CACHE, "%s: cli %p freed %ld pages.\n",
-              cli_name(cli), cli, rc);
-       return rc;
+       CDEBUG(D_CACHE, "%s: cli %p freed %ld/%ld pages.\n",
+              cli_name(cli), cli, rc, shrank);
+       return shrank > 0 ? shrank : rc;
 }
 
 /**
@@ -877,6 +1114,20 @@ void osc_lru_unreserve(struct client_obd *cli, unsigned long npages)
        wake_up(&osc_lru_waitq);
 }
 
+long osc_unevict_cache_shrink(const struct lu_env *env, struct client_obd *cli)
+{
+       long rc;
+
+       ENTRY;
+
+       rc = osc_lru_list_shrink(env, cli, SK_REASON_UNEVICT_LRU,
+                                &cli->cl_unevict_lru_list,
+                                &cli->cl_unevict_lru_in_list,
+                                0, true, NULL);
+
+       RETURN(rc);
+}
+
 /**
  * Atomic operations are expensive. We accumulate the accounting for the
  * same page zone to get better performance.
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index 8db5523..180ae77 100644
@@ -3531,6 +3531,27 @@ int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
                RETURN(0);
        }
 
+       if (KEY_IS(KEY_UNEVICT_CACHE_SHRINK)) {
+               struct client_obd *cli = &obd->u.cli;
+               long ret;
+
+               ret = osc_unevict_cache_shrink(env, cli);
+               if (ret > 0)
+                       ret = 0;
+
+               /*
+                * Clear unused cache pages and move mlock()ed pages from
+                * the normal LRU list into unevictable LRU list.
+                */
+               ret = osc_lru_shrink(env, cli,
+                                    atomic_long_read(&cli->cl_lru_in_list),
+                                    true);
+               if (ret > 0)
+                       ret = 0;
+
+               RETURN(ret);
+       }
+
        if (!set && !KEY_IS(KEY_GRANT_SHRINK))
                RETURN(-EINVAL);
 
@@ -4002,6 +4023,9 @@ static struct ll_shrinker_ops osc_cache_sh_ops = {
 static int osc_cache_shrink(struct shrinker *shrinker,
                            struct shrink_control *sc)
 {
+       if (!osc_page_cache_shrink_enabled)
+               return 0;
+
        (void)osc_cache_shrink_scan(shrinker, sc);
 
        return osc_cache_shrink_count(shrinker, sc);
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 7df2c24..39db647 100755
@@ -31909,6 +31909,262 @@ test_460d() {
 }
 run_test 460d "Check encrypt pools output"
 
+resident_pages() {
+       local file=$1
+
+       vmtouch $file | awk '/Resident Pages:/ {print $3}' |
+               awk -F/ '{ print $1 }'
+}
+
+# The command "echo 2 > /proc/sys/vm/drop_caches" may revoke DLM locks
+# due to slab cache reclaim. Thus we should avoid reclaiming the slab
+# cache for DLM locks during testing, since releasing a DLM extent lock
+# may evict the mlock()ed pages it covers.
+# After the page cache shrinker is disabled, neither "echo 3" nor
+# "echo 2" to /proc/sys/vm/drop_caches will scan and clear unused pages
+# from the LRU list.
+disable_page_cache_shrink() {
+       local enabled=$($LCTL get_param -n osc.*.enable_page_cache_shrink |
+                       head -n 1)
+
+       stack_trap "$LCTL set_param osc.*.enable_page_cache_shrink=$enabled"
+       $LCTL set_param osc.*.enable_page_cache_shrink=0
+}
+
+enable_mlock_pages_check() {
+       local enabled=$($LCTL get_param -n llite.*.enable_mlock_pages)
+
+       stack_trap "$LCTL set_param llite.*.enable_mlock_pages=$enabled"
+       $LCTL set_param llite.*.enable_mlock_pages=1
+}
+
+test_600a() {
+       local file=$DIR/$tfile
+       local size_mb=100
+       local pcnt=$((size_mb * 1024 * 1024 / PAGE_SIZE))
+
+       which vmtouch || skip_env "This test needs vmtouch utility"
+       check_set_fallocate_or_skip
+       disable_page_cache_shrink
+       enable_mlock_pages_check
+
+       fallocate -l ${size_mb}M $file || error "failed to fallocate $file"
+       stack_trap "pkill -9 vmtouch || true"
+       vmtouch -vltdw -m 1g $file || error "failed to vmtouch $file"
+
+       local rcnt=$(resident_pages $file)
+
+       echo "before drop_caches (0):"
+       grep Mlocked: /proc/meminfo
+       $LCTL get_param llite.*.max_cached_mb
+       echo "drop page caches (1):"
+       echo 1 > /proc/sys/vm/drop_caches
+       grep Mlocked: /proc/meminfo
+       $LCTL get_param llite.*.max_cached_mb
+       vmtouch $file
+       (( $pcnt == $rcnt )) || error "resident pages are $rcnt, expected $pcnt"
+
+       local unevict_mb
+
+       $LCTL set_param llite.*.unevict_cached_mb=clear
+       $LCTL get_param llite.*.unevict_cached_mb
+       unevict_mb=$($LCTL get_param -n llite.*.unevict_cached_mb)
+       (( $unevict_mb == $size_mb )) ||
+               error "unevict_cached_mb is $unevict_mb, expected $size_mb"
+
+       $LCTL set_param $OSC.*$OSC*.osc_unevict_cached_mb=clear
+       $LCTL get_param $OSC.*$OSC*.osc_unevict_cached_mb
+       unevict_mb=$($LCTL get_param -n $OSC.*$OSC*.osc_unevict_cached_mb |
+                    awk '{sum += $1 } END { print sum }')
+       (( $unevict_mb == $size_mb )) ||
+               error "osc_unevict_cached_mb is $unevict_mb, expected $size_mb"
+
+       # Revoking a lock evicts the cached pages it protects. This is the
+       # desired behavior for conflicting access from a remote client.
+       # But how should we handle lock revocation triggered by LRU lock
+       # shrinking on the client side: should locks protecting mlocked
+       # pages be canceled in that case, or should such locks not be put
+       # on the lock LRU list at all?
+       cancel_lru_locks $OSC
+       echo "drop lru DLM lock:"
+       grep Mlocked: /proc/meminfo
+       $LCTL get_param llite.*.max_cached_mb
+       $LCTL get_param osc.*.osc_cached_mb
+       rcnt=$(resident_pages $file)
+       (( $rcnt == 0 )) || error "resident pages are $rcnt, expected zero"
+       unevict_mb=$($LCTL get_param -n llite.*.unevict_cached_mb)
+       (( $unevict_mb == 0 )) ||
+               error "unevict_cached_mb is $unevict_mb, expected 0"
+       unevict_mb=$($LCTL get_param -n $OSC.*$OSC*.osc_unevict_cached_mb |
+                    awk '{sum += $1 } END { print sum }')
+       (( $unevict_mb == 0 )) ||
+               error "osc_unevict_cached_mb is $unevict_mb, expected 0"
+
+}
+run_test 600a "basic test for mlock()ed file"
+
+test_600b() {
+       local file=$DIR/$tfile
+       local size_mb=100
+       local cache_limit=64
+       local max_cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+                             awk '/^max_cached_mb/ { print $2 }')
+
+       which vmtouch || skip_env "This test needs vmtouch utility"
+       check_set_fallocate_or_skip
+       disable_page_cache_shrink
+       enable_mlock_pages_check
+
+       fallocate -l ${size_mb}M $file || error "failed to fallocate $file"
+       stack_trap "pkill -9 vmtouch || true"
+
+       cancel_lru_locks $OSC
+       $LCTL get_param llite.*.max_cached_mb
+       stack_trap "$LCTL set_param llite.*.max_cached_mb=$max_cached_mb"
+       $LCTL set_param llite.*.max_cached_mb=$cache_limit
+
+       # The required mlock()ed pages (100M) are larger than @max_cached_mb.
+       vmtouch -vltdw -m 1g $file || error "failed to mlock $file"
+       vmtouch $file
+       grep Mlocked: /proc/meminfo
+
+       local used_mb
+       local unevict_mb
+
+       echo 1 > /proc/sys/vm/drop_caches
+       $LCTL get_param llite.*.max_cached_mb
+       $LCTL set_param llite.*.unevict_cached_mb=clear
+       used_mb=$($LCTL get_param llite.*.max_cached_mb |
+                 awk '/^used_mb/ { print $2 }')
+       unevict_mb=$($LCTL get_param -n llite.*.unevict_cached_mb)
+       (( $used_mb == 0 )) || error "used_mb is $used_mb, expected 0"
+       (( $unevict_mb == $size_mb )) ||
+               error "unevict_mb is $unevict_mb, expected $size_mb"
+}
+run_test 600b "mlock a file (via vmtouch) larger than max_cached_mb"
+
+test_600c() {
+       local dir=$DIR/$tdir
+       local cache_limit=64
+       local max_cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+                             awk '/^max_cached_mb/ { print $2 }')
+
+       which vmtouch || skip_env "This test needs vmtouch utility"
+       check_set_fallocate_or_skip
+       disable_page_cache_shrink
+       enable_mlock_pages_check
+
+       stack_trap "rm -rf $dir"
+       stack_trap "$LCTL set_param llite.*.max_cached_mb=$max_cached_mb"
+       $LCTL set_param llite.*.max_cached_mb=$cache_limit
+       stack_trap "pkill -9 vmtouch || true"
+
+       local size=$((64 * 1048576))
+       local file1=$dir/$tfile.1
+       local file2=$dir/$tfile.2
+
+       mkdir $dir || error "failed to mkdir $dir"
+       fallocate -l $size $file1 || error "failed to fallocate $file1"
+       fallocate -l $size $file2 || error "failed to fallocate $file2"
+       cancel_lru_locks $OSC
+
+       vmtouch -vltdw -m 1g $file1 || error "failed to vmtouch $file1"
+       $LCTL get_param llite.*.max_cached_mb
+       $LCTL set_param llite.*.unevict_cached_mb=clear
+       $LCTL get_param llite.*.max_cached_mb
+
+       local cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+                         awk '/^used_mb/ { print $2 }')
+
+       [ $cached_mb -eq 0 ] || error "expected used_mb 0 got $cached_mb"
+       cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+                   awk '/^unevict_mb/ { print $2 }')
+       [ $cached_mb -eq 64 ] || error "expected unevict_mb 64 got $cached_mb"
+
+       vmtouch -vt $file2 || error "failed to vmtouch $file2"
+       echo 3 > /proc/sys/vm/drop_caches
+       dd if=$file2 of=/dev/null bs=1M count=64 ||
+               error "failed to read $file2 into cache"
+
+       pkill -9 vmtouch || error "failed to kill vmtouch"
+       vmtouch -vt $file2 || error "failed to load $file2 into cache"
+       $LCTL get_param llite.*.max_cached_mb
+       echo 1 > /proc/sys/vm/drop_caches
+       $LCTL set_param llite.*.unevict_cached_mb=clear
+       cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+                   awk '/^used_mb/ { print $2 }')
+       [ $cached_mb -eq 0 ] || error "expected used_mb 0 got $cached_mb"
+       cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+                   awk '/^unevict_mb/ { print $2 }')
+       [ $cached_mb -eq 0 ] || error "expected unevict_mb 0 got $cached_mb"
+}
+run_test 600c "Test I/O when mlocked page count > @max_cached_mb"
+
+test_600d_base() {
+       local mlcksz=$1
+       local fsz=$2
+       local n=$3
+       local dir=$DIR/$tdir
+       local mlckf=$dir/mlockfile
+
+       echo "mlock size: $mlcksz file size: $fsz, n: $n"
+       mkdir -p $dir || error "mkdir $dir failed"
+
+       fallocate -l $mlcksz $mlckf || error "failed to fallocate $mlckf"
+       for ((i = 0; i < $n; i++)); do
+               fallocate -l $fsz $dir/$tfile.$i ||
+                       error "failed to fallocate $dir/$tfile.$i"
+       done
+
+       cancel_lru_locks $OSC
+
+       declare -a pids
+
+       vmtouch -vltdw -m 1G $mlckf || error "failed to mlock $mlckf"
+       for ((i = 0; i < $n; i++)); do
+               vmtouch -t -m 1g $dir/$tfile.$i &
+               pids[i]=$!
+       done
+
+       grep 'Mlocked' /proc/meminfo
+       $LCTL get_param llite.*.max_cached_mb
+       echo "drop caches:"
+       echo 1 > /proc/sys/vm/drop_caches
+       $LCTL set_param llite.*.unevict_cached_mb=clear
+       $LCTL get_param llite.*.max_cached_mb
+
+       for ((i = 0; i < $n; i++)); do
+               wait ${pids[i]} || error "touch $dir/$tfile.$i failed: rc = $?"
+       done
+
+       grep 'Mlocked:' /proc/meminfo
+       pkill -9 vmtouch || true
+       rm -rvf $dir || error "failed to rm $dir"
+}
+
+test_600d() {
+       local dir=$DIR/$tdir
+       local cache_limit=64
+       local max_cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+                             awk '/^max_cached_mb/ { print $2 }')
+
+       which vmtouch || skip_env "This test needs vmtouch utility"
+       check_set_fallocate_or_skip
+       disable_page_cache_shrink
+       enable_mlock_pages_check
+
+       stack_trap "rm -rf $dir"
+       stack_trap "$LCTL set_param llite.*.max_cached_mb=$max_cached_mb"
+       $LCTL set_param llite.*.max_cached_mb=$cache_limit
+       stack_trap "pkill -9 vmtouch || true"
+
+       local size=$((cache_limit * 1048576))
+
+       test_600d_base $((size - PAGE_SIZE)) 4096 16
+       test_600d_base $((size - 2 * PAGE_SIZE)) 16384 16
+}
+run_test 600d "Test I/O with limited LRU page slots (some was mlocked)"
+
 prep_801() {
        [[ $MDS1_VERSION -lt $(version_code 2.9.55) ]] ||
        [[ $OST1_VERSION -lt $(version_code 2.9.55) ]] &&