LU-17463 osc: add support for unevictable mlock()ed pages 26/53826/16
author Qian Yingjin <qian@ddn.com>
Fri, 26 Jan 2024 07:49:34 +0000 (02:49 -0500)
committer Oleg Drokin <green@whamcloud.com>
Fri, 23 Aug 2024 21:57:16 +0000 (21:57 +0000)
The page cache shrinker does not distinguish mlock()ed pages from
normal unused pages in the page LRU list and may wrongly remove
them from the cache.

In this patch, we use a separate unevictable list to manage the
pages marked with the PG_mlocked flag.
As there is no direct notification or interface informing the
filesystem of page mlock()/munlock() calls, we have to scan the
whole unevictable list to check whether it contains any freeable
pages and remove them later.

Thus we implement two interfaces to scan unevictable pages that
are no longer marked with PG_mlocked and remove them from the
cache manually (a usage sketch follows this list):
- $LCTL set_param llite.*.unevict_cached_mb=clear
  Scan the unevictable lists on a Lustre client FS and clear all
  pages that are no longer marked with PG_mlocked.
  Scan the normal LRU list and move any pages marked with
  PG_mlocked into the unevictable list.
- $LCTL set_param osc.*.osc_unevict_cached_mb=clear
  Scan the unevictable list of this OSC obd device and clear all
  pages that are no longer marked with PG_mlocked.
  Scan the normal LRU list and move any pages marked with
  PG_mlocked into the unevictable list.
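
For example, after pinning a file's pages with vmtouch (which
mlock()s them), the unevictable accounting can be refreshed and
read back as follows; the mount point and file name here are
illustrative only:

    vmtouch -vltdw -m 1g /mnt/lustre/file  # mlock() the file pages
    $LCTL set_param llite.*.unevict_cached_mb=clear
    $LCTL get_param -n llite.*.unevict_cached_mb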

The command "echo 3 > /proc/sys/vm/drop_caches" can also be
used to evict the pages unlocked by munlock() system call which
are no longer marked with PG_mlocked.
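
A minimal sketch of that path (killing the vmtouch process from
the example above munlock()s the pages it pinned):

    pkill vmtouch                      # munlock() the pinned pages
    echo 3 > /proc/sys/vm/drop_caches  # drop the now-evictable pages
    $LCTL get_param -n llite.*.unevict_cached_mb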

In our design, mlocked pages are not accounted in the LRU. Thus,
the total number of cached pages on a Lustre client FS may exceed
the maximum LRU limit @max_cached_mb.

Add a tunable parameter to control whether the mlocked-pages
functionality is enabled:
llite.*.enable_mlock_pages
This allows the feature to be disabled if something goes wrong in
the field.
It is disabled by default.
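
For example, to enable the feature and read back the setting:

    $LCTL set_param llite.*.enable_mlock_pages=1
    $LCTL get_param llite.*.enable_mlock_pages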

Add test cases sanity/test_600{a,b,c,d}.
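
The new tests can be run individually with the usual sanity.sh
selector, e.g. (assuming a configured test environment with the
vmtouch utility installed):

    ONLY=600a bash lustre/tests/sanity.sh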

Signed-off-by: Qian Yingjin <qian@ddn.com>
Change-Id: I0713ad254999dfc32ed5063ec0d8e042968793a9
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53826
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Patrick Farrell <patrick.farrell@oracle.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
15 files changed:
lustre.spec.in
lustre/include/cl_object.h
lustre/include/lustre_osc.h
lustre/include/obd.h
lustre/ldlm/ldlm_lib.c
lustre/llite/llite_mmap.c
lustre/llite/lproc_llite.c
lustre/llite/rw.c
lustre/llite/vvp_page.c
lustre/obdclass/cl_page.c
lustre/osc/lproc_osc.c
lustre/osc/osc_internal.h
lustre/osc/osc_page.c
lustre/osc/osc_request.c
lustre/tests/sanity.sh

diff --git a/lustre.spec.in b/lustre.spec.in
index e157c0c..0d83191 100644
@@ -478,7 +478,7 @@ Requires: attr, rsync, lsof, /usr/bin/getconf
 Requires: /usr/sbin/getenforce, acl, /usr/bin/killall, /usr/bin/ping, bc
 # Of the supported targets, only rhel7 doesn't support Recommends.
 %if 0%{?rhel} > 7 || 0%{?fedora} > 33 || 0%{?rhel} < 1
-Recommends: perl, dbench, iozone
+Recommends: perl, dbench, iozone, vmtouch
 # Either of these is sufficient
 Suggests: pdsh, clush
 %endif
diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h
index 0c6c81a..afe3dd8 100644
@@ -2284,6 +2284,10 @@ struct cl_client_cache {
         */
        atomic_long_t           ccc_lru_left;
        /**
+        * # of unevictable LRU entries
+        */
+       atomic_long_t           ccc_unevict_lru_used;
+       /**
         * List of entities(OSCs) for this LRU cache
         */
        struct list_head        ccc_lru;
@@ -2298,7 +2302,11 @@ struct cl_client_cache {
        /**
         * Set if unstable check is enabled
         */
-       unsigned int            ccc_unstable_check:1;
+       unsigned int            ccc_unstable_check:1,
+       /**
+        * Whether unevictable (mlock pages) checking is enabled
+        */
+                               ccc_mlock_pages_enable:1;
        /**
         * # of unstable pages for this mount point
         */
diff --git a/lustre/include/lustre_osc.h b/lustre/include/lustre_osc.h
index 4713551..aba4a71 100644
@@ -529,7 +529,11 @@ struct osc_page {
        /**
         * If the page is in osc_object::oo_tree.
         */
-                               ops_intree:1;
+                               ops_intree:1,
+       /**
+        * If the page is marked with PG_mlocked.
+        */
+                               ops_vm_locked:1;
        /**
         * lru page list. See osc_lru_{del|use}() in osc_page.c for usage.
         */
diff --git a/lustre/include/obd.h b/lustre/include/obd.h
index 41d9211..061119c 100644
@@ -295,6 +295,10 @@ struct client_obd {
        atomic_long_t            cl_lru_busy;
        /** # of LRU pages in the cache for this client_obd */
        atomic_long_t            cl_lru_in_list;
+       /**
+        * # of LRU pages marked with PG_mlocked in the cache on the client.
+        */
+       atomic_long_t            cl_unevict_lru_in_list;
        /** # of threads are shrinking LRU cache. To avoid contention, it's not
         * allowed to have multiple threads shrinking LRU cache. */
        atomic_t                 cl_lru_shrinkers;
@@ -305,6 +309,8 @@ struct client_obd {
         * reclaim is sync, initiated by IO thread when the LRU slots are
         * in shortage. */
        __u64                    cl_lru_reclaim;
+       /** List of unevictable LRU pages for this client_obd */
+       struct list_head         cl_unevict_lru_list;
        /** List of LRU pages for this client_obd */
        struct list_head         cl_lru_list;
        /** Lock for LRU page list */
@@ -842,6 +848,8 @@ static inline bool obd_mdt_recovery_abort(struct obd_device *obd)
 #define KEY_CACHE_LRU_SHRINK   "cache_lru_shrink"
 #define KEY_OSP_CONNECTED      "osp_connected"
 
+#define KEY_UNEVICT_CACHE_SHRINK       "unevict_cache_shrink"
+
 /* Flags for op_xvalid */
 enum op_xvalid {
        OP_XVALID_CTIME_SET     = BIT(0),       /* 0x0001 */
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c
index fdc0bb7..539e13c 100644
@@ -408,7 +408,9 @@ int client_obd_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
        atomic_set(&cli->cl_lru_shrinkers, 0);
        atomic_long_set(&cli->cl_lru_busy, 0);
        atomic_long_set(&cli->cl_lru_in_list, 0);
+       atomic_long_set(&cli->cl_unevict_lru_in_list, 0);
        INIT_LIST_HEAD(&cli->cl_lru_list);
+       INIT_LIST_HEAD(&cli->cl_unevict_lru_list);
        spin_lock_init(&cli->cl_lru_list_lock);
        atomic_long_set(&cli->cl_unstable_count, 0);
        INIT_LIST_HEAD(&cli->cl_shrink_list);
diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c
index ee7ea29..a40920f 100644
@@ -401,10 +401,11 @@ static vm_fault_t ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                return result;
 
        CDEBUG(D_MMAP|D_IOTRACE,
-              "START file %s:"DFID", vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu\n",
+              "START file %s:"DFID", vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu vmf_flags=%#x\n",
               file_dentry(vma->vm_file)->d_name.name,
               PFID(&ll_i2info(file_inode(vma->vm_file))->lli_fid),
-              vma, vma->vm_start, vma->vm_end, vma->vm_flags, vmf->pgoff);
+              vma, vma->vm_start, vma->vm_end, vma->vm_flags, vmf->pgoff,
+              vmf->flags);
 
        /* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite
         * so that it can be killed by admin but not cause segfault by
@@ -461,10 +462,10 @@ restart:
        }
 
        CDEBUG(D_IOTRACE,
-              "COMPLETED: "DFID": vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu, rc %d\n",
+              "COMPLETED: "DFID": vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu vmf_flags=%#x: rc=%d\n",
               PFID(&ll_i2info(file_inode(vma->vm_file))->lli_fid),
               vma, vma->vm_start, vma->vm_end, vma->vm_flags, vmf->pgoff,
-              result);
+              vmf->flags, result);
 
        return result;
 }
diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c
index 2e41ae4..956bad6 100644
@@ -463,16 +463,20 @@ static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v)
        struct ll_ra_info *ra = &sbi->ll_ra_info;
        long max_cached_mb;
        long unused_mb;
+       long unevict_mb;
 
        mutex_lock(&cache->ccc_max_cache_mb_lock);
        max_cached_mb = PAGES_TO_MiB(cache->ccc_lru_max);
        unused_mb = PAGES_TO_MiB(atomic_long_read(&cache->ccc_lru_left));
+       unevict_mb = PAGES_TO_MiB(
+                       atomic_long_read(&cache->ccc_unevict_lru_used));
        mutex_unlock(&cache->ccc_max_cache_mb_lock);
 
        seq_printf(m, "users: %d\n"
                      "max_cached_mb: %ld\n"
                      "used_mb: %ld\n"
                      "unused_mb: %ld\n"
+                     "unevict_mb: %ld\n"
                      "reclaim_count: %u\n"
                      "max_read_ahead_mb: %lu\n"
                      "used_read_ahead_mb: %d\n",
@@ -480,6 +484,7 @@ static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v)
                   max_cached_mb,
                   max_cached_mb - unused_mb,
                   unused_mb,
+                  unevict_mb,
                   cache->ccc_lru_shrinkers,
                   PAGES_TO_MiB(ra->ra_max_pages),
                   PAGES_TO_MiB(atomic_read(&ra->ra_cur_pages)));
@@ -614,6 +619,97 @@ out_unlock:
 }
 LDEBUGFS_SEQ_FOPS(ll_max_cached_mb);
 
+static int ll_unevict_cached_mb_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct cl_client_cache *cache = sbi->ll_cache;
+       long unevict_mb;
+
+       mutex_lock(&cache->ccc_max_cache_mb_lock);
+       unevict_mb = PAGES_TO_MiB(
+                       atomic_long_read(&cache->ccc_unevict_lru_used));
+       mutex_unlock(&cache->ccc_max_cache_mb_lock);
+
+       seq_printf(m, "%ld\n", unevict_mb);
+       return 0;
+}
+
+static ssize_t ll_unevict_cached_mb_seq_write(struct file *file,
+                                             const char __user *buffer,
+                                             size_t count, loff_t *off)
+{
+       struct seq_file *m = file->private_data;
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct lu_env *env;
+       __u16 refcheck;
+       char kernbuf[128];
+       int rc;
+
+       ENTRY;
+
+       if (count >= sizeof(kernbuf))
+               RETURN(-EINVAL);
+
+       if (copy_from_user(kernbuf, buffer, count))
+               RETURN(-EFAULT);
+
+       kernbuf[count] = 0;
+       if (count != 5 || strncmp(kernbuf, "clear", 5) != 0)
+               RETURN(-EINVAL);
+
+       env = cl_env_get(&refcheck);
+       if (IS_ERR(env))
+               RETURN(PTR_ERR(env));
+
+       /* being initialized */
+       if (sbi->ll_dt_exp == NULL)
+               GOTO(out, rc = -ENODEV);
+
+       rc = obd_set_info_async(env, sbi->ll_dt_exp,
+                               sizeof(KEY_UNEVICT_CACHE_SHRINK),
+                               KEY_UNEVICT_CACHE_SHRINK,
+                               0, NULL, NULL);
+out:
+       cl_env_put(env, &refcheck);
+       if (rc >= 0)
+               rc = count;
+
+       RETURN(rc);
+}
+LDEBUGFS_SEQ_FOPS(ll_unevict_cached_mb);
+
+static int ll_enable_mlock_pages_seq_show(struct seq_file *m, void *v)
+{
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct cl_client_cache *cache = sbi->ll_cache;
+
+       seq_printf(m, "%d\n", cache->ccc_mlock_pages_enable);
+       return 0;
+}
+
+static ssize_t ll_enable_mlock_pages_seq_write(struct file *file,
+                                              const char __user *buffer,
+                                              size_t count, loff_t *off)
+{
+       struct seq_file *m = file->private_data;
+       struct super_block *sb = m->private;
+       struct ll_sb_info *sbi = ll_s2sbi(sb);
+       struct cl_client_cache *cache = sbi->ll_cache;
+       bool val;
+       int rc;
+
+       rc = kstrtobool_from_user(buffer, count, &val);
+       if (rc)
+               return rc;
+
+       cache->ccc_mlock_pages_enable = val;
+       return count;
+}
+LDEBUGFS_SEQ_FOPS(ll_enable_mlock_pages);
+
 static ssize_t pcc_async_threshold_show(struct kobject *kobj,
                                        struct attribute *attr, char *buffer)
 {
@@ -2261,6 +2357,10 @@ struct ldebugfs_vars lprocfs_llite_obd_vars[] = {
          .fops =       &ll_site_stats_fops                     },
        { .name =       "max_cached_mb",
          .fops =       &ll_max_cached_mb_fops                  },
+       { .name =       "unevict_cached_mb",
+         .fops =       &ll_unevict_cached_mb_fops              },
+       { .name =       "enable_mlock_pages",
+         .fops =       &ll_enable_mlock_pages_fops             },
        { .name =       "statahead_stats",
          .fops =       &ll_statahead_stats_fops                },
        { .name =       "unstable_stats",
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index 0ea0358..d315775 100644
@@ -2149,6 +2149,8 @@ int ll_readpage(struct file *file, struct page *vmpage)
        } else {
                unlock_page(vmpage);
                result = PTR_ERR(page);
+               CDEBUG(D_CACHE, "failed to alloc page@%pK index %ld: rc = %d\n",
+                      vmpage, vmpage->index, result);
        }
 
 out:
diff --git a/lustre/llite/vvp_page.c b/lustre/llite/vvp_page.c
index edc3be1..b3659d5 100644
@@ -75,6 +75,8 @@ static void vvp_page_delete(const struct lu_env *env,
                LASSERT(PageLocked(vmpage));
                LASSERT((struct cl_page *)vmpage->private == cp);
 
+               CDEBUG(D_CACHE, "delete page %pK index %ld\n",
+                      vmpage, vmpage->index);
                /* Drop the reference count held in vvp_page_init */
                refcount_dec(&cp->cp_ref);
 
diff --git a/lustre/obdclass/cl_page.c b/lustre/obdclass/cl_page.c
index f25ab21..a90316b 100644
@@ -1159,6 +1159,7 @@ struct cl_client_cache *cl_cache_init(unsigned long lru_page_max)
        refcount_set(&cache->ccc_users, 1);
        cache->ccc_lru_max = lru_page_max;
        atomic_long_set(&cache->ccc_lru_left, lru_page_max);
+       atomic_long_set(&cache->ccc_unevict_lru_used, 0);
        spin_lock_init(&cache->ccc_lru_lock);
        INIT_LIST_HEAD(&cache->ccc_lru);
 
diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c
index 0bce55b..8f30ec8 100644
@@ -191,10 +191,12 @@ static int osc_cached_mb_seq_show(struct seq_file *m, void *v)
 
        seq_printf(m, "used_mb: %ld\n"
                   "busy_cnt: %ld\n"
+                  "unevict_cnt: %ld\n"
                   "reclaim: %llu\n",
                   (atomic_long_read(&cli->cl_lru_in_list) +
                    atomic_long_read(&cli->cl_lru_busy)) >> shift,
-                   atomic_long_read(&cli->cl_lru_busy),
+                  atomic_long_read(&cli->cl_lru_busy),
+                  atomic_long_read(&cli->cl_unevict_lru_in_list),
                   cli->cl_lru_reclaim);
 
        return 0;
@@ -244,6 +246,56 @@ static ssize_t osc_cached_mb_seq_write(struct file *file,
 
 LPROC_SEQ_FOPS(osc_cached_mb);
 
+static int osc_unevict_cached_mb_seq_show(struct seq_file *m, void *v)
+{
+       struct obd_device *obd = m->private;
+       struct client_obd *cli = &obd->u.cli;
+       int shift = 20 - PAGE_SHIFT;
+
+       seq_printf(m, "%ld\n",
+                  atomic_long_read(&cli->cl_unevict_lru_in_list) >> shift);
+       return 0;
+}
+
+static ssize_t osc_unevict_cached_mb_seq_write(struct file *file,
+                                              const char __user *buffer,
+                                              size_t count, loff_t *off)
+{
+       struct seq_file *m = file->private_data;
+       struct obd_device *obd = m->private;
+       struct client_obd *cli = &obd->u.cli;
+       char kernbuf[128];
+
+       if (count >= sizeof(kernbuf))
+               return -EINVAL;
+
+       if (copy_from_user(kernbuf, buffer, count))
+               return -EFAULT;
+
+       kernbuf[count] = 0;
+       if (count == 5 && strncmp(kernbuf, "clear", 5) == 0) {
+               struct lu_env *env;
+               __u16 refcheck;
+
+               env = cl_env_get(&refcheck);
+               if (!IS_ERR(env)) {
+                       (void)osc_unevict_cache_shrink(env, cli);
+                       /*
+                        * Scan the LRU list, discard the LRU pages or move
+                        * the unevictable/mlock()ed pages into the unevictable
+                        * list.
+                        */
+                       (void)osc_lru_shrink(env, cli,
+                               atomic_long_read(&cli->cl_lru_in_list), true);
+                       cl_env_put(env, &refcheck);
+               }
+               return count;
+       }
+
+       return -EINVAL;
+}
+LPROC_SEQ_FOPS(osc_unevict_cached_mb);
+
 static ssize_t cur_dirty_bytes_show(struct kobject *kobj,
                                    struct attribute *attr,
                                    char *buf)
@@ -711,6 +763,8 @@ struct lprocfs_vars lprocfs_osc_obd_vars[] = {
          .fops =       &osc_obd_max_pages_per_rpc_fops },
        { .name =       "osc_cached_mb",
          .fops =       &osc_cached_mb_fops             },
+       { .name =       "osc_unevict_cached_mb",
+         .fops =       &osc_unevict_cached_mb_fops     },
        { .name =       "cur_grant_bytes",
          .fops =       &osc_cur_grant_bytes_fops       },
        { .name =       "checksum_type",
diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h
index ee68cf4..39e366d 100644
@@ -181,6 +181,8 @@ extern unsigned long osc_cache_shrink_count(struct shrinker *sk,
                                            struct shrink_control *sc);
 extern unsigned long osc_cache_shrink_scan(struct shrinker *sk,
                                           struct shrink_control *sc);
+extern long osc_unevict_cache_shrink(const struct lu_env *env,
+                                    struct client_obd *cli);
 static inline unsigned int osc_max_write_chunks(const struct client_obd *cli)
 {
        /*
diff --git a/lustre/osc/osc_page.c b/lustre/osc/osc_page.c
index bb8b350..dbe8457 100644
@@ -423,16 +423,33 @@ void osc_lru_add_batch(struct client_obd *cli, struct list_head *plist)
                cli->cl_lru_last_used = ktime_get_real_seconds();
                spin_unlock(&cli->cl_lru_list_lock);
 
-               if (waitqueue_active(&osc_lru_waitq))
+               if (waitqueue_active(&osc_lru_waitq)) {
                        (void)ptlrpcd_queue_work(cli->cl_lru_work);
+                       CDEBUG(D_CACHE,
+                              "%s: cli %pK add LRU: i%ld/b%ld/u%ld/l%ld/m%ld %ld\n",
+                              cli_name(cli), cli,
+                              atomic_long_read(&cli->cl_lru_in_list),
+                              atomic_long_read(&cli->cl_lru_busy),
+                              atomic_long_read(&cli->cl_unevict_lru_in_list),
+                              atomic_long_read(cli->cl_lru_left),
+                              cli->cl_cache->ccc_lru_max, npages);
+               }
+
        }
 }
 
 static void __osc_lru_del(struct client_obd *cli, struct osc_page *opg)
 {
-       LASSERT(atomic_long_read(&cli->cl_lru_in_list) > 0);
+       LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0);
+
        list_del_init(&opg->ops_lru);
-       atomic_long_dec(&cli->cl_lru_in_list);
+       if (opg->ops_vm_locked) {
+               atomic_long_dec(&cli->cl_unevict_lru_in_list);
+               atomic_long_dec(&cli->cl_cache->ccc_unevict_lru_used);
+               opg->ops_vm_locked = 0;
+       } else {
+               atomic_long_dec(&cli->cl_lru_in_list);
+       }
 }
 
 /**
@@ -442,8 +459,11 @@ static void __osc_lru_del(struct client_obd *cli, struct osc_page *opg)
 static void osc_lru_del(struct client_obd *cli, struct osc_page *opg)
 {
        if (opg->ops_in_lru) {
+               bool mlocked = false;
+
                spin_lock(&cli->cl_lru_list_lock);
                if (!list_empty(&opg->ops_lru)) {
+                       mlocked = opg->ops_vm_locked;
                        __osc_lru_del(cli, opg);
                } else {
                        LASSERT(atomic_long_read(&cli->cl_lru_busy) > 0);
@@ -451,7 +471,8 @@ static void osc_lru_del(struct client_obd *cli, struct osc_page *opg)
                }
                spin_unlock(&cli->cl_lru_list_lock);
 
-               atomic_long_inc(cli->cl_lru_left);
+               if (!mlocked)
+                       atomic_long_inc(cli->cl_lru_left);
                /* this is a great place to release more LRU pages if
                 * this osc occupies too many LRU pages and kernel is
                 * stealing one of them. */
@@ -528,49 +549,169 @@ static inline bool lru_page_busy(struct client_obd *cli, struct cl_page *page)
 }
 
 /**
- * Drop @target of pages from LRU at most.
+ * Check whether a page is mlocked and unevictable.
  */
-long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
-                  long target, bool force)
+static inline bool lru_page_unevictable(struct cl_page *clpage)
+{
+       return PageMlocked(cl_page_vmpage(clpage));
+}
+
+enum shrink_action {
+       SK_ACTION_WILL_FREE     = 0,
+       SK_ACTION_OWN_FAIL      = 1,
+       SK_ACTION_UNEVICT_ADD   = 2,
+       SK_ACTION_UNEVICT_DEL   = 3,
+       SK_ACTION_BUSY_SKIP     = 4,
+       SK_ACTION_INVAL         = 6,
+       SK_ACTION_MAX,
+};
+
+static inline bool
+cache_unevict_check_enabled(struct client_obd *cli)
+{
+       return cli->cl_cache->ccc_mlock_pages_enable;
+}
+
+static inline enum shrink_action
+osc_normal_lru_check(const struct lu_env *env, struct client_obd *cli,
+                    struct cl_io *io, struct osc_page *opg)
+{
+       struct cl_page *clpage = opg->ops_cl.cpl_page;
+       enum shrink_action action = SK_ACTION_OWN_FAIL;
+
+       if (cl_page_own_try(env, io, clpage) == 0) {
+               if (cache_unevict_check_enabled(cli) &&
+                   lru_page_unevictable(clpage)) {
+                       opg->ops_vm_locked = 1;
+                       cl_page_disown(env, io, clpage);
+                       list_move_tail(&opg->ops_lru,
+                                      &cli->cl_unevict_lru_list);
+                       return SK_ACTION_UNEVICT_ADD;
+               }
+               if (!lru_page_busy(cli, clpage)) {
+                       /*
+                        * remove it from lru list earlier to avoid
+                        * lock contention.
+                        */
+                       __osc_lru_del(cli, opg);
+                       opg->ops_in_lru = 0; /* will be discarded */
+
+                       cl_page_get(clpage);
+                       return SK_ACTION_WILL_FREE;
+               }
+
+               cl_page_disown(env, io, clpage);
+               action = SK_ACTION_BUSY_SKIP;
+       }
+
+       list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+
+       return action;
+}
+
+static inline enum shrink_action
+osc_unevict_lru_check(const struct lu_env *env, struct client_obd *cli,
+                     struct cl_io *io, struct osc_page *opg)
+{
+       struct cl_page *clpage = opg->ops_cl.cpl_page;
+       enum shrink_action action = SK_ACTION_OWN_FAIL;
+
+       if (cl_page_own_try(env, io, clpage) == 0) {
+               if (!lru_page_busy(cli, clpage) &&
+                   !lru_page_unevictable(clpage)) {
+                       LASSERT(opg->ops_vm_locked == 1);
+                       __osc_lru_del(cli, opg);
+                       opg->ops_in_lru = 0; /* will be discarded */
+
+                       cl_page_get(clpage);
+                       return SK_ACTION_UNEVICT_DEL;
+               }
+
+               cl_page_disown(env, io, clpage);
+               action = SK_ACTION_BUSY_SKIP;
+       }
+
+       list_move_tail(&opg->ops_lru, &cli->cl_unevict_lru_list);
+
+       return action;
+}
+
+/*
+ * Which LRU list the shrink work was initiated for.
+ */
+enum sk_reason {
+       SK_REASON_NORMAL_LRU,
+       SK_REASON_UNEVICT_LRU,
+};
+
+static inline enum shrink_action
+osc_lru_page_check(const struct lu_env *env, struct client_obd *cli,
+                  enum sk_reason reason, struct cl_io *io,
+                  struct osc_page *opg)
+{
+       switch (reason) {
+       case SK_REASON_NORMAL_LRU:
+               return osc_normal_lru_check(env, cli, io, opg);
+       case SK_REASON_UNEVICT_LRU:
+               return osc_unevict_lru_check(env, cli, io, opg);
+       default:
+               CERROR("%s: unsupported shrink type: %d\n",
+                      cli_name(cli), reason);
+               LBUG();
+               return SK_ACTION_INVAL;
+       }
+}
+
+static inline int osc_lru_maxscan(enum sk_reason reason, long *target,
+                                 bool force, atomic_long_t *lru_in_list)
+{
+       int maxscan;
+
+       if (force && reason == SK_REASON_UNEVICT_LRU) {
+               maxscan = atomic_long_read(lru_in_list);
+               if (*target == 0)
+                       *target = maxscan;
+       } else {
+               maxscan = min((*target) << 1, atomic_long_read(lru_in_list));
+       }
+
+       return maxscan;
+}
+
+static long osc_lru_list_shrink(const struct lu_env *env,
+                               struct client_obd *cli,
+                               enum sk_reason reason,
+                               struct list_head *lru_list,
+                               atomic_long_t *lru_in_list,
+                               long target, bool force,
+                               long *unevict_delta)
 {
-       struct cl_io *io;
        struct cl_object *clobj = NULL;
        struct cl_page **pvec;
        struct osc_page *opg;
+       struct cl_io *io;
        long count = 0;
-       int maxscan = 0;
        int index = 0;
+       int maxscan;
        int rc = 0;
+       enum shrink_action action;
+       int actnum[SK_ACTION_MAX] = { 0 };
+
        ENTRY;
 
-       LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0);
-       if (atomic_long_read(&cli->cl_lru_in_list) == 0 || target <= 0)
+       LASSERT(atomic_long_read(lru_in_list) >= 0);
+       if (atomic_long_read(lru_in_list) == 0 || target < 0)
                RETURN(0);
 
-       CDEBUG(D_CACHE, "%s: shrinkers: %d, force: %d\n",
-              cli_name(cli), atomic_read(&cli->cl_lru_shrinkers), force);
-       if (!force) {
-               if (atomic_read(&cli->cl_lru_shrinkers) > 0)
-                       RETURN(-EBUSY);
-
-               if (atomic_inc_return(&cli->cl_lru_shrinkers) > 1) {
-                       atomic_dec(&cli->cl_lru_shrinkers);
-                       RETURN(-EBUSY);
-               }
-       } else {
-               atomic_inc(&cli->cl_lru_shrinkers);
-       }
-
        pvec = (struct cl_page **)osc_env_info(env)->oti_pvec;
        io = osc_env_thread_io(env);
 
        spin_lock(&cli->cl_lru_list_lock);
-       if (force)
+       if (force && reason == SK_REASON_NORMAL_LRU)
                cli->cl_lru_reclaim++;
-       maxscan = min(target << 1, atomic_long_read(&cli->cl_lru_in_list));
-       while (!list_empty(&cli->cl_lru_list)) {
+       maxscan = osc_lru_maxscan(reason, &target, force, lru_in_list);
+       while (!list_empty(lru_list)) {
                struct cl_page *page;
-               bool will_free = false;
 
                if (!force && atomic_read(&cli->cl_lru_shrinkers) > 1)
                        break;
@@ -578,11 +719,13 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
                if (--maxscan < 0)
                        break;
 
-               opg = list_first_entry(&cli->cl_lru_list, struct osc_page,
-                                      ops_lru);
+               opg = list_first_entry(lru_list, struct osc_page, ops_lru);
                page = opg->ops_cl.cpl_page;
-               if (lru_page_busy(cli, page)) {
-                       list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+               if (lru_page_busy(cli, page) &&
+                   !(reason == SK_REASON_NORMAL_LRU &&
+                     lru_page_unevictable(page))) {
+                       list_move_tail(&opg->ops_lru, lru_list);
+                       actnum[SK_ACTION_BUSY_SKIP]++;
                        continue;
                }
 
@@ -617,24 +760,22 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
                        continue;
                }
 
-               if (cl_page_own_try(env, io, page) == 0) {
-                       if (!lru_page_busy(cli, page)) {
-                               /* remove it from lru list earlier to avoid
-                                * lock contention */
-                               __osc_lru_del(cli, opg);
-                               opg->ops_in_lru = 0; /* will be discarded */
-
-                               cl_page_get(page);
-                               will_free = true;
-                       } else {
-                               cl_page_disown(env, io, page);
-                       }
-               }
-
-               if (!will_free) {
-                       list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+               action = osc_lru_page_check(env, cli, reason, io, opg);
+               actnum[action]++;
+               if (action == SK_ACTION_UNEVICT_ADD) {
+                       if (unevict_delta)
+                               (*unevict_delta)++;
+                       /*
+                        * The page is moved from the normal LRU list into
+                        * the unevict list.
+                        */
+                       if (++count >= target)
+                               break;
                        continue;
                }
+               if (action != SK_ACTION_WILL_FREE &&
+                   action != SK_ACTION_UNEVICT_DEL)
+                       continue;
 
                /* Don't discard and free the page with cl_lru_list held */
                pvec[index++] = page;
@@ -649,6 +790,20 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
                if (++count >= target)
                        break;
        }
+
+       CDEBUG(D_CACHE, "%s: LRU %s empty %d maxscan %d i%ld/u%ld/b%ld/l%ld actcnt %d/%d/%d/%d/%d count %ld\n",
+              cli_name(cli),
+              reason == SK_REASON_NORMAL_LRU ? "normal" : "unevict",
+              list_empty(lru_list), maxscan,
+              atomic_long_read(&cli->cl_lru_in_list),
+              atomic_long_read(&cli->cl_unevict_lru_in_list),
+              atomic_long_read(&cli->cl_lru_busy),
+              atomic_long_read(cli->cl_lru_left),
+              actnum[SK_ACTION_WILL_FREE],
+              actnum[SK_ACTION_OWN_FAIL],
+              actnum[SK_ACTION_UNEVICT_ADD],
+              actnum[SK_ACTION_UNEVICT_DEL],
+              actnum[SK_ACTION_BUSY_SKIP], count);
        spin_unlock(&cli->cl_lru_list_lock);
 
        if (clobj != NULL) {
@@ -659,12 +814,78 @@ long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
                cond_resched();
        }
 
+       RETURN(count > 0 ? count : rc);
+}
+
+/**
+ * Drop @target of pages from LRU at most.
+ */
+long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
+                  long target, bool force)
+{
+       struct cl_client_cache *cache = cli->cl_cache;
+       long unevict_delta = 0;
+       long shrank = 0;
+       long count = 0;
+
+       ENTRY;
+
+       LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0);
+       if (atomic_long_read(&cli->cl_lru_in_list) == 0 || target <= 0)
+               RETURN(0);
+
+       CDEBUG(D_CACHE,
+              "%s: shrinkers: %d force: %d target: %ld LRU: i%ld/u%ld/b%ld/l%ld\n",
+              cli_name(cli), atomic_read(&cli->cl_lru_shrinkers), force,
+              target, atomic_long_read(&cli->cl_lru_in_list),
+              atomic_long_read(&cli->cl_unevict_lru_in_list),
+              atomic_long_read(&cli->cl_lru_busy),
+              atomic_long_read(cli->cl_lru_left));
+       if (!force) {
+               if (atomic_read(&cli->cl_lru_shrinkers) > 0)
+                       RETURN(-EBUSY);
+
+               if (atomic_inc_return(&cli->cl_lru_shrinkers) > 1) {
+                       atomic_dec(&cli->cl_lru_shrinkers);
+                       RETURN(-EBUSY);
+               }
+       } else {
+               atomic_inc(&cli->cl_lru_shrinkers);
+       }
+
+       count = osc_lru_list_shrink(env, cli, SK_REASON_NORMAL_LRU,
+                                   &cli->cl_lru_list, &cli->cl_lru_in_list,
+                                   target, force, &unevict_delta);
+       if (count < 0)
+               GOTO(out, count);
+
+       shrank = count;
+       if (force)
+               GOTO(out, count);
+
+       /*
+        * TODO: In non-force mode, should we also scan the unevictable
+        * list and try to free pages that are no longer marked PG_mlocked?
+        */
+out:
        atomic_dec(&cli->cl_lru_shrinkers);
-       if (count > 0) {
-               atomic_long_add(count, cli->cl_lru_left);
+       if (unevict_delta > 0) {
+               atomic_long_sub(unevict_delta, &cli->cl_lru_in_list);
+               atomic_long_add(unevict_delta, &cli->cl_unevict_lru_in_list);
+               atomic_long_add(unevict_delta, &cache->ccc_unevict_lru_used);
+       }
+       if (shrank > 0) {
+               atomic_long_add(shrank, cli->cl_lru_left);
+               CDEBUG(D_CACHE,
+                      "%s: LRU shrink %ld i%ld/u%ld/b%ld/l%ld\n",
+                      cli_name(cli), shrank,
+                      atomic_long_read(&cli->cl_lru_in_list),
+                      atomic_long_read(&cli->cl_unevict_lru_in_list),
+                      atomic_long_read(&cli->cl_lru_busy),
+                      atomic_long_read(cli->cl_lru_left));
                wake_up(&osc_lru_waitq);
        }
-       RETURN(count > 0 ? count : rc);
+       RETURN(shrank > 0 ? shrank : count);
 }
 EXPORT_SYMBOL(osc_lru_shrink);
 
@@ -681,7 +902,9 @@ static long osc_lru_reclaim(struct client_obd *cli, unsigned long npages)
        struct client_obd *scan;
        int max_scans;
        __u16 refcheck;
+       long shrank = 0;
        long rc = 0;
+
        ENTRY;
 
        LASSERT(cache != NULL);
@@ -699,14 +922,20 @@ static long osc_lru_reclaim(struct client_obd *cli, unsigned long npages)
                       cli_name(cli), rc, npages);
                if (osc_cache_too_much(cli) > 0)
                        ptlrpcd_queue_work(cli->cl_lru_work);
+               shrank = rc;
                GOTO(out, rc);
        } else if (rc > 0) {
+               shrank = rc;
                npages -= rc;
        }
 
-       CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %ld/%ld, want: %ld\n",
-               cli_name(cli), cli, atomic_long_read(&cli->cl_lru_in_list),
-               atomic_long_read(&cli->cl_lru_busy), npages);
+       CDEBUG(D_CACHE,
+              "%s: cli %p no free slots, pages: i%ld/u%ld/b%ld/l%ld/m%ld, want: %ld\n",
+              cli_name(cli), cli, atomic_long_read(&cli->cl_lru_in_list),
+              atomic_long_read(&cli->cl_unevict_lru_in_list),
+              atomic_long_read(&cli->cl_lru_busy),
+              atomic_long_read(cli->cl_lru_left),
+              cli->cl_cache->ccc_lru_max, npages);
 
        /* Reclaim LRU slots from other client_obd as it can't free enough
         * from its own. This should rarely happen. */
@@ -721,10 +950,12 @@ static long osc_lru_reclaim(struct client_obd *cli, unsigned long npages)
               (scan = list_first_entry_or_null(&cache->ccc_lru,
                                                  struct client_obd,
                                                  cl_lru_osc)) != NULL) {
-               CDEBUG(D_CACHE, "%s: cli %p LRU pages: %ld, busy: %ld.\n",
+               CDEBUG(D_CACHE,
+                      "%s: cli %p LRU pages: %ld, busy: %ld, unevict: %ld.\n",
                       cli_name(scan), scan,
                       atomic_long_read(&scan->cl_lru_in_list),
-                      atomic_long_read(&scan->cl_lru_busy));
+                      atomic_long_read(&scan->cl_lru_busy),
+                      atomic_long_read(&scan->cl_unevict_lru_in_list));
 
                list_move_tail(&scan->cl_lru_osc, &cache->ccc_lru);
                if (osc_cache_too_much(scan) > 0) {
@@ -732,19 +963,25 @@ static long osc_lru_reclaim(struct client_obd *cli, unsigned long npages)
 
                        rc = osc_lru_shrink(env, scan, npages, true);
                        spin_lock(&cache->ccc_lru_lock);
-                       if (rc >= npages)
+                       if (rc >= npages) {
+                               shrank += rc;
                                break;
-                       if (rc > 0)
+                       }
+                       if (rc > 0) {
+                               shrank += rc;
                                npages -= rc;
+                       }
                }
        }
        spin_unlock(&cache->ccc_lru_lock);
 
+       if (shrank > 0)
+               GOTO(out, rc);
 out:
        cl_env_put(env, &refcheck);
-       CDEBUG(D_CACHE, "%s: cli %p freed %ld pages.\n",
-              cli_name(cli), cli, rc);
-       return rc;
+       CDEBUG(D_CACHE, "%s: cli %p freed %ld/%ld pages.\n",
+              cli_name(cli), cli, rc, shrank);
+       return shrank > 0 ? shrank : rc;
 }
 
 /**
@@ -877,6 +1114,20 @@ void osc_lru_unreserve(struct client_obd *cli, unsigned long npages)
        wake_up(&osc_lru_waitq);
 }
 
+long osc_unevict_cache_shrink(const struct lu_env *env, struct client_obd *cli)
+{
+       long rc;
+
+       ENTRY;
+
+       rc = osc_lru_list_shrink(env, cli, SK_REASON_UNEVICT_LRU,
+                                &cli->cl_unevict_lru_list,
+                                &cli->cl_unevict_lru_in_list,
+                                0, true, NULL);
+
+       RETURN(rc);
+}
+
 /**
  * Atomic operations are expensive. We accumulate the accounting for the
  * same page zone to get better performance.
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index 8db5523..180ae77 100644
@@ -3531,6 +3531,27 @@ int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
                RETURN(0);
        }
 
+       if (KEY_IS(KEY_UNEVICT_CACHE_SHRINK)) {
+               struct client_obd *cli = &obd->u.cli;
+               long ret;
+
+               ret = osc_unevict_cache_shrink(env, cli);
+               if (ret > 0)
+                       ret = 0;
+
+               /*
+                * Clear unused cache pages and move mlock()ed pages from
+                * the normal LRU list into unevictable LRU list.
+                */
+               ret = osc_lru_shrink(env, cli,
+                                    atomic_long_read(&cli->cl_lru_in_list),
+                                    true);
+               if (ret > 0)
+                       ret = 0;
+
+               RETURN(ret);
+       }
+
        if (!set && !KEY_IS(KEY_GRANT_SHRINK))
                RETURN(-EINVAL);
 
@@ -4002,6 +4023,9 @@ static struct ll_shrinker_ops osc_cache_sh_ops = {
 static int osc_cache_shrink(struct shrinker *shrinker,
                            struct shrink_control *sc)
 {
+       if (!osc_page_cache_shrink_enabled)
+               return 0;
+
        (void)osc_cache_shrink_scan(shrinker, sc);
 
        return osc_cache_shrink_count(shrinker, sc);
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 7df2c24..39db647 100755
@@ -31909,6 +31909,262 @@ test_460d() {
 }
 run_test 460d "Check encrypt pools output"
 
+resident_pages() {
+       local file=$1
+
+       vmtouch $file | awk '/Resident Pages:/ {print $3}' |
+               awk -F/ '{ print $1 }'
+}
+
+# The command "echo 2 > /proc/sys/vm/drop_caches" may revoke DLM locks
+# due to slab cache reclaim. Thus we should avoid reclaiming the slab
+# cache for DLM locks during testing, since releasing a DLM extent lock
+# may evict the mlock()ed pages it covers.
+# After the page cache shrinker is disabled, neither "echo 3" nor
+# "echo 2" to /proc/sys/vm/drop_caches will scan and clear unused pages
+# from the LRU list.
+disable_page_cache_shrink() {
+       local enabled=$($LCTL get_param -n osc.*.enable_page_cache_shrink |
+                       head -n 1)
+
+       stack_trap "$LCTL set_param osc.*.enable_page_cache_shrink=$enabled"
+       $LCTL set_param osc.*.enable_page_cache_shrink=0
+}
+
+enable_mlock_pages_check() {
+       local enabled=$($LCTL get_param -n llite.*.enable_mlock_pages)
+
+       stack_trap "$LCTL set_param llite.*.enable_mlock_pages=$enabled"
+       $LCTL set_param llite.*.enable_mlock_pages=1
+}
+
+test_600a() {
+       local file=$DIR/$tfile
+       local size_mb=100
+       local pcnt=$((size_mb * 1024 * 1024 / PAGE_SIZE))
+
+       which vmtouch || skip_env "This test needs vmtouch utility"
+       check_set_fallocate_or_skip
+       disable_page_cache_shrink
+       enable_mlock_pages_check
+
+       fallocate -l ${size_mb}M $file || error "failed to fallocate $file"
+       stack_trap "pkill -9 vmtouch || true"
+       vmtouch -vltdw -m 1g $file || error "failed to vmtouch $file"
+
+       local rcnt=$(resident_pages $file)
+
+       echo "before drop_caches (0):"
+       grep Mlocked: /proc/meminfo
+       $LCTL get_param llite.*.max_cached_mb
+       echo "drop page caches (1):"
+       echo 1 > /proc/sys/vm/drop_caches
+       grep Mlocked: /proc/meminfo
+       $LCTL get_param llite.*.max_cached_mb
+       vmtouch $file
+       (( $pcnt == $rcnt )) || error "resident pages are $rcnt, expected $pcnt"
+
+       local unevict_mb
+
+       $LCTL set_param llite.*.unevict_cached_mb=clear
+       $LCTL get_param llite.*.unevict_cached_mb
+       unevict_mb=$($LCTL get_param -n llite.*.unevict_cached_mb)
+       (( $unevict_mb == $size_mb )) ||
+               error "unevict_cached_mb is $unevict_mb, expected $size_mb"
+
+       $LCTL set_param $OSC.*$OSC*.osc_unevict_cached_mb=clear
+       $LCTL get_param $OSC.*$OSC*.osc_unevict_cached_mb
+       unevict_mb=$($LCTL get_param -n $OSC.*$OSC*.osc_unevict_cached_mb |
+                    awk '{sum += $1 } END { print sum }')
+       (( $unevict_mb == $size_mb )) ||
+               error "osc_unevict_cached_mb is $unevict_mb, expected $size_mb"
+
+       # Revoking a lock evicts the cached pages it protects. This is the
+       # desired behavior for conflicting access from a remote client.
+       # But how should we handle lock revocation triggered by LRU lock
+       # shrinking on the client side: should locks protecting mlocked
+       # pages be canceled in that case, or should such locks not be put
+       # on the lock LRU list at all?
+       cancel_lru_locks $OSC
+       echo "drop lru DLM lock:"
+       grep Mlocked: /proc/meminfo
+       $LCTL get_param llite.*.max_cached_mb
+       $LCTL get_param osc.*.osc_cached_mb
+       rcnt=$(resident_pages $file)
+       (( $rcnt == 0 )) || error "resident pages are $rcnt, expected zero"
+       unevict_mb=$($LCTL get_param -n llite.*.unevict_cached_mb)
+       (( $unevict_mb == 0 )) ||
+               error "unevict_cached_mb is $unevict_mb, expected 0"
+       unevict_mb=$($LCTL get_param -n $OSC.*$OSC*.osc_unevict_cached_mb |
+                    awk '{sum += $1 } END { print sum }')
+       (( $unevict_mb == 0 )) ||
+               error "osc_unevict_cached_mb is $unevict_mb, expected 0"
+
+}
+run_test 600a "basic test for mlock()ed file"
+
+test_600b() {
+       local file=$DIR/$tfile
+       local size_mb=100
+       local cache_limit=64
+       local max_cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+                             awk '/^max_cached_mb/ { print $2 }')
+
+       which vmtouch || skip_env "This test needs vmtouch utility"
+       check_set_fallocate_or_skip
+       disable_page_cache_shrink
+       enable_mlock_pages_check
+
+       fallocate -l ${size_mb}M $file || error "failed to fallocate $file"
+       stack_trap "pkill -9 vmtouch || true"
+
+       cancel_lru_locks $OSC
+       $LCTL get_param llite.*.max_cached_mb
+       stack_trap "$LCTL set_param llite.*.max_cached_mb=$max_cached_mb"
+       $LCTL set_param llite.*.max_cached_mb=$cache_limit
+
+       # The required mlock()ed pages (100M) are larger than @max_cached_mb.
+       vmtouch -vltdw -m 1g $file || error "failed to mlock $file"
+       vmtouch $file
+       grep Mlocked: /proc/meminfo
+
+       local used_mb
+       local unevict_mb
+
+       echo 1 > /proc/sys/vm/drop_caches
+       $LCTL get_param llite.*.max_cached_mb
+       $LCTL set_param llite.*.unevict_cached_mb=clear
+       used_mb=$($LCTL get_param llite.*.max_cached_mb |
+                 awk '/^used_mb/ { print $2 }')
+       unevict_mb=$($LCTL get_param -n llite.*.unevict_cached_mb)
+       (( $used_mb == 0 )) || error "used_mb is $used_mb, expected 0"
+       (( $unevict_mb == $size_mb )) ||
+               error "unevict_mb is $unevict_mb, expected $size_mb"
+}
+run_test 600b "mlock a file (via vmtouch) larger than max_cached_mb"
+
+test_600c() {
+       local dir=$DIR/$tdir
+       local cache_limit=64
+       local max_cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+                             awk '/^max_cached_mb/ { print $2 }')
+
+       which vmtouch || skip_env "This test needs vmtouch utility"
+       check_set_fallocate_or_skip
+       disable_page_cache_shrink
+       enable_mlock_pages_check
+
+       stack_trap "rm -rf $dir"
+       stack_trap "$LCTL set_param llite.*.max_cached_mb=$max_cached_mb"
+       $LCTL set_param llite.*.max_cached_mb=$cache_limit
+       stack_trap "pkill -9 vmtouch || true"
+
+       local size=$((64 * 1048576))
+       local file1=$dir/$tfile.1
+       local file2=$dir/$tfile.2
+
+       mkdir $dir || error "failed to mkdir $dir"
+       fallocate -l $size $file1 || error "failed to fallocate $file1"
+       fallocate -l $size $file2 || error "failed to fallocate $file2"
+       cancel_lru_locks $OSC
+
+       vmtouch -vltdw -m 1g $file1 || error "failed to vmtouch $file1"
+       $LCTL get_param llite.*.max_cached_mb
+       $LCTL set_param llite.*.unevict_cached_mb=clear
+       $LCTL get_param llite.*.max_cached_mb
+
+       local cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+                         awk '/^used_mb/ { print $2 }')
+
+       [ $cached_mb -eq 0 ] || error "expected used_mb 0 got $cached_mb"
+       cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+                   awk '/^unevict_mb/ { print $2 }')
+       [ $cached_mb -eq 64 ] || error "expected unevict_mb 64 got $cached_mb"
+
+       vmtouch -vt $file2 || error "failed to vmtouch $file2"
+       echo 3 > /proc/sys/vm/drop_caches
+       dd if=$file2 of=/dev/null bs=1M count=64 ||
+               error "failed to read $file2 into cache"
+
+       pkill -9 vmtouch || error "failed to kill vmtouch"
+       vmtouch -vt $file2 || error "failed to load $file2 into cache"
+       $LCTL get_param llite.*.max_cached_mb
+       echo 1 > /proc/sys/vm/drop_caches
+       $LCTL set_param llite.*.unevict_cached_mb=clear
+       cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+                   awk '/^used_mb/ { print $2 }')
+       [ $cached_mb -eq 0 ] || error "expected used_mb 0 got $cached_mb"
+       cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+                   awk '/^unevict_mb/ { print $2 }')
+       [ $cached_mb -eq 0 ] || error "expected unevict_mb 0 got $cached_mb"
+}
+run_test 600c "Test I/O when mlocked page count > @max_cached_mb"
+
+test_600d_base() {
+       local mlcksz=$1
+       local fsz=$2
+       local n=$3
+       local dir=$DIR/$tdir
+       local mlckf=$dir/mlockfile
+
+       echo "mlock size: $mlcksz file size: $fsz, n: $n"
+       mkdir -p $dir || error "mkdir $dir failed"
+
+       fallocate -l $mlcksz $mlckf || error "failed to fallocate $mlckf"
+       for ((i = 0; i < $n; i++)); do
+               fallocate -l $fsz $dir/$tfile.$i ||
+                       error "failed to fallocate $dir/$tfile.$i"
+       done
+
+       cancel_lru_locks $OSC
+
+       declare -a pids
+
+       vmtouch -vltdw -m 1G $mlckf || error "failed to mlock $mlckf"
+       for ((i = 0; i < $n; i++)); do
+               vmtouch -t -m 1g $dir/$tfile.$i &
+               pids[i]=$!
+       done
+
+       grep 'Mlocked' /proc/meminfo
+       $LCTL get_param llite.*.max_cached_mb
+       echo "drop caches:"
+       echo 1 > /proc/sys/vm/drop_caches
+       $LCTL set_param llite.*.unevict_cached_mb=clear
+       $LCTL get_param llite.*.max_cached_mb
+
+       for ((i = 0; i < $n; i++)); do
+               wait ${pids[i]} || error "touch $dir/$tfile.$i failed: rc = $?"
+       done
+
+       grep 'Mlocked:' /proc/meminfo
+       pkill -9 vmtouch || true
+       rm -rvf $dir || error "failed to rm $dir"
+}
+
+test_600d() {
+       local dir=$DIR/$tdir
+       local cache_limit=64
+       local max_cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+                             awk '/^max_cached_mb/ { print $2 }')
+
+       which vmtouch || skip_env "This test needs vmtouch utility"
+       check_set_fallocate_or_skip
+       disable_page_cache_shrink
+       enable_mlock_pages_check
+
+       stack_trap "rm -rf $dir"
+       stack_trap "$LCTL set_param llite.*.max_cached_mb=$max_cached_mb"
+       $LCTL set_param llite.*.max_cached_mb=$cache_limit
+       stack_trap "pkill -9 vmtouch || true"
+
+       local size=$((cache_limit * 1048576))
+
+       test_600d_base $((size - PAGE_SIZE)) 4096 16
+       test_600d_base $((size - 2 * PAGE_SIZE)) 16384 16
+}
+run_test 600d "Test I/O with limited LRU page slots (some was mlocked)"
+
 prep_801() {
        [[ $MDS1_VERSION -lt $(version_code 2.9.55) ]] ||
        [[ $OST1_VERSION -lt $(version_code 2.9.55) ]] &&