Requires: /usr/sbin/getenforce, acl, /usr/bin/killall, /usr/bin/ping, bc
# Of the supported targets, only rhel7 doesn't support Recommends.
%if 0%{?rhel} > 7 || 0%{?fedora} > 33 || 0%{?rhel} < 1
-Recommends: perl, dbench, iozone
+Recommends: perl, dbench, iozone, vmtouch
# Either of these is sufficient
Suggests: pdsh, clush
%endif
*/
atomic_long_t ccc_lru_left;
/**
+ * # of unevictable LRU entries
+ */
+ atomic_long_t ccc_unevict_lru_used;
+ /**
* List of entities(OSCs) for this LRU cache
*/
struct list_head ccc_lru;
/**
* Set if unstable check is enabled
*/
- unsigned int ccc_unstable_check:1;
+ unsigned int ccc_unstable_check:1,
+ /**
+ * Whether unevictable (mlock pages) checking is enabled
+ */
+ ccc_mlock_pages_enable:1;
/**
* # of unstable pages for this mount point
*/
/**
* If the page is in osc_object::oo_tree.
*/
- ops_intree:1;
+ ops_intree:1,
+ /**
+ * If the page is marked with PG_mlocked.
+ */
+ ops_vm_locked:1;
/**
* lru page list. See osc_lru_{del|use}() in osc_page.c for usage.
*/
atomic_long_t cl_lru_busy;
/** # of LRU pages in the cache for this client_obd */
atomic_long_t cl_lru_in_list;
+ /**
+ * # of LRU pages marked with PG_mlocked in the cache on the client.
+ */
+ atomic_long_t cl_unevict_lru_in_list;
/** # of threads are shrinking LRU cache. To avoid contention, it's not
* allowed to have multiple threads shrinking LRU cache. */
atomic_t cl_lru_shrinkers;
* reclaim is sync, initiated by IO thread when the LRU slots are
* in shortage. */
__u64 cl_lru_reclaim;
+ /** List of unevictable LRU pages for this client_obd */
+ struct list_head cl_unevict_lru_list;
/** List of LRU pages for this client_obd */
struct list_head cl_lru_list;
/** Lock for LRU page list */
#define KEY_CACHE_LRU_SHRINK "cache_lru_shrink"
#define KEY_OSP_CONNECTED "osp_connected"
+#define KEY_UNEVICT_CACHE_SHRINK "unevict_cache_shrink"
+
/* Flags for op_xvalid */
enum op_xvalid {
OP_XVALID_CTIME_SET = BIT(0), /* 0x0001 */
atomic_set(&cli->cl_lru_shrinkers, 0);
atomic_long_set(&cli->cl_lru_busy, 0);
atomic_long_set(&cli->cl_lru_in_list, 0);
+ atomic_long_set(&cli->cl_unevict_lru_in_list, 0);
INIT_LIST_HEAD(&cli->cl_lru_list);
+ INIT_LIST_HEAD(&cli->cl_unevict_lru_list);
spin_lock_init(&cli->cl_lru_list_lock);
atomic_long_set(&cli->cl_unstable_count, 0);
INIT_LIST_HEAD(&cli->cl_shrink_list);
return result;
CDEBUG(D_MMAP|D_IOTRACE,
- "START file %s:"DFID", vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu\n",
+ "START file %s:"DFID", vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu vmf_flags=%#x\n",
file_dentry(vma->vm_file)->d_name.name,
PFID(&ll_i2info(file_inode(vma->vm_file))->lli_fid),
- vma, vma->vm_start, vma->vm_end, vma->vm_flags, vmf->pgoff);
+ vma, vma->vm_start, vma->vm_end, vma->vm_flags, vmf->pgoff,
+ vmf->flags);
/* Only SIGKILL and SIGTERM is allowed for fault/nopage/mkwrite
* so that it can be killed by admin but not cause segfault by
}
CDEBUG(D_IOTRACE,
- "COMPLETED: "DFID": vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu, rc %d\n",
+ "COMPLETED: "DFID": vma=%p start=%#lx end=%#lx vm_flags=%#lx idx=%lu vmf_flags=%#x: rc=%d\n",
PFID(&ll_i2info(file_inode(vma->vm_file))->lli_fid),
vma, vma->vm_start, vma->vm_end, vma->vm_flags, vmf->pgoff,
- result);
+ vmf->flags, result);
return result;
}
struct ll_ra_info *ra = &sbi->ll_ra_info;
long max_cached_mb;
long unused_mb;
+ long unevict_mb;
mutex_lock(&cache->ccc_max_cache_mb_lock);
max_cached_mb = PAGES_TO_MiB(cache->ccc_lru_max);
unused_mb = PAGES_TO_MiB(atomic_long_read(&cache->ccc_lru_left));
+ unevict_mb = PAGES_TO_MiB(
+ atomic_long_read(&cache->ccc_unevict_lru_used));
mutex_unlock(&cache->ccc_max_cache_mb_lock);
seq_printf(m, "users: %d\n"
"max_cached_mb: %ld\n"
"used_mb: %ld\n"
"unused_mb: %ld\n"
+ "unevict_mb: %ld\n"
"reclaim_count: %u\n"
"max_read_ahead_mb: %lu\n"
"used_read_ahead_mb: %d\n",
max_cached_mb,
max_cached_mb - unused_mb,
unused_mb,
+ unevict_mb,
cache->ccc_lru_shrinkers,
PAGES_TO_MiB(ra->ra_max_pages),
PAGES_TO_MiB(atomic_read(&ra->ra_cur_pages)));
}
LDEBUGFS_SEQ_FOPS(ll_max_cached_mb);
+static int ll_unevict_cached_mb_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ struct cl_client_cache *cache = sbi->ll_cache;
+ long unevict_mb;
+
+ mutex_lock(&cache->ccc_max_cache_mb_lock);
+ unevict_mb = PAGES_TO_MiB(
+ atomic_long_read(&cache->ccc_unevict_lru_used));
+ mutex_unlock(&cache->ccc_max_cache_mb_lock);
+
+ seq_printf(m, "%ld\n", unevict_mb);
+ return 0;
+}
+
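+/*
+ * Writing "clear" asks every OSC under this mount (via the
+ * KEY_UNEVICT_CACHE_SHRINK key) to shrink its unevictable list and to
+ * rescan its normal LRU list, moving mlock()ed pages onto the
+ * unevictable list and dropping unused cached pages.
+ */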
+static ssize_t ll_unevict_cached_mb_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct seq_file *m = file->private_data;
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ struct lu_env *env;
+ __u16 refcheck;
+ char kernbuf[128];
+ int rc;
+
+ ENTRY;
+
+ if (count >= sizeof(kernbuf))
+ RETURN(-EINVAL);
+
+ if (copy_from_user(kernbuf, buffer, count))
+ RETURN(-EFAULT);
+
+ kernbuf[count] = 0;
+ if (count != 5 || strncmp(kernbuf, "clear", 5) != 0)
+ RETURN(-EINVAL);
+
+ env = cl_env_get(&refcheck);
+ if (IS_ERR(env))
+ RETURN(PTR_ERR(env));
+
+ /* being initialized */
+ if (sbi->ll_dt_exp == NULL)
+ GOTO(out, rc = -ENODEV);
+
+ rc = obd_set_info_async(env, sbi->ll_dt_exp,
+ sizeof(KEY_UNEVICT_CACHE_SHRINK),
+ KEY_UNEVICT_CACHE_SHRINK,
+ 0, NULL, NULL);
+out:
+ cl_env_put(env, &refcheck);
+ if (rc >= 0)
+ rc = count;
+
+ RETURN(rc);
+}
+LDEBUGFS_SEQ_FOPS(ll_unevict_cached_mb);
+
+static int ll_enable_mlock_pages_seq_show(struct seq_file *m, void *v)
+{
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ struct cl_client_cache *cache = sbi->ll_cache;
+
+ seq_printf(m, "%d\n", cache->ccc_mlock_pages_enable);
+ return 0;
+}
+
+static ssize_t ll_enable_mlock_pages_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct seq_file *m = file->private_data;
+ struct super_block *sb = m->private;
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ struct cl_client_cache *cache = sbi->ll_cache;
+ bool val;
+ int rc;
+
+ rc = kstrtobool_from_user(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ cache->ccc_mlock_pages_enable = val;
+ return count;
+}
+LDEBUGFS_SEQ_FOPS(ll_enable_mlock_pages);
+
static ssize_t pcc_async_threshold_show(struct kobject *kobj,
struct attribute *attr, char *buffer)
{
.fops = &ll_site_stats_fops },
{ .name = "max_cached_mb",
.fops = &ll_max_cached_mb_fops },
+ { .name = "unevict_cached_mb",
+ .fops = &ll_unevict_cached_mb_fops },
+ { .name = "enable_mlock_pages",
+ .fops = &ll_enable_mlock_pages_fops },
{ .name = "statahead_stats",
.fops = &ll_statahead_stats_fops },
{ .name = "unstable_stats",
} else {
unlock_page(vmpage);
result = PTR_ERR(page);
+ CDEBUG(D_CACHE, "failed to alloc page@%pK index%ld: rc = %d\n",
+ vmpage, vmpage->index, result);
}
out:
LASSERT(PageLocked(vmpage));
LASSERT((struct cl_page *)vmpage->private == cp);
+ CDEBUG(D_CACHE, "delete page %pK index %ld\n",
+ vmpage, vmpage->index);
/* Drop the reference count held in vvp_page_init */
refcount_dec(&cp->cp_ref);
refcount_set(&cache->ccc_users, 1);
cache->ccc_lru_max = lru_page_max;
atomic_long_set(&cache->ccc_lru_left, lru_page_max);
+ atomic_long_set(&cache->ccc_unevict_lru_used, 0);
spin_lock_init(&cache->ccc_lru_lock);
INIT_LIST_HEAD(&cache->ccc_lru);
seq_printf(m, "used_mb: %ld\n"
"busy_cnt: %ld\n"
+ "unevict_cnt: %ld\n"
"reclaim: %llu\n",
(atomic_long_read(&cli->cl_lru_in_list) +
atomic_long_read(&cli->cl_lru_busy)) >> shift,
- atomic_long_read(&cli->cl_lru_busy),
+ atomic_long_read(&cli->cl_lru_busy),
+ atomic_long_read(&cli->cl_unevict_lru_in_list),
cli->cl_lru_reclaim);
return 0;
LPROC_SEQ_FOPS(osc_cached_mb);
+static int osc_unevict_cached_mb_seq_show(struct seq_file *m, void *v)
+{
+ struct obd_device *obd = m->private;
+ struct client_obd *cli = &obd->u.cli;
+ int shift = 20 - PAGE_SHIFT;
+
+ seq_printf(m, "%ld\n",
+ atomic_long_read(&cli->cl_unevict_lru_in_list) >> shift);
+ return 0;
+}
+
+static ssize_t osc_unevict_cached_mb_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct seq_file *m = file->private_data;
+ struct obd_device *obd = m->private;
+ struct client_obd *cli = &obd->u.cli;
+ char kernbuf[128];
+
+ if (count >= sizeof(kernbuf))
+ return -EINVAL;
+
+ if (copy_from_user(kernbuf, buffer, count))
+ return -EFAULT;
+
+ kernbuf[count] = 0;
+ if (count == 5 && strncmp(kernbuf, "clear", 5) == 0) {
+ struct lu_env *env;
+ __u16 refcheck;
+
+ env = cl_env_get(&refcheck);
+ if (!IS_ERR(env)) {
+ (void)osc_unevict_cache_shrink(env, cli);
+ /*
+ * Scan the LRU list, discard the LRU pages or move
+ * the unevictable/mlock()ed pages into the unevictable
+ * list.
+ */
+ (void)osc_lru_shrink(env, cli,
+ atomic_long_read(&cli->cl_lru_in_list), true);
+ cl_env_put(env, &refcheck);
+ }
+ return count;
+ }
+
+ return -EINVAL;
+}
+LPROC_SEQ_FOPS(osc_unevict_cached_mb);
+
static ssize_t cur_dirty_bytes_show(struct kobject *kobj,
struct attribute *attr,
char *buf)
.fops = &osc_obd_max_pages_per_rpc_fops },
{ .name = "osc_cached_mb",
.fops = &osc_cached_mb_fops },
+ { .name = "osc_unevict_cached_mb",
+ .fops = &osc_unevict_cached_mb_fops },
{ .name = "cur_grant_bytes",
.fops = &osc_cur_grant_bytes_fops },
{ .name = "checksum_type",
struct shrink_control *sc);
extern unsigned long osc_cache_shrink_scan(struct shrinker *sk,
struct shrink_control *sc);
+extern long osc_unevict_cache_shrink(const struct lu_env *env,
+ struct client_obd *cli);
static inline unsigned int osc_max_write_chunks(const struct client_obd *cli)
{
/*
cli->cl_lru_last_used = ktime_get_real_seconds();
spin_unlock(&cli->cl_lru_list_lock);
- if (waitqueue_active(&osc_lru_waitq))
+ if (waitqueue_active(&osc_lru_waitq)) {
(void)ptlrpcd_queue_work(cli->cl_lru_work);
+ CDEBUG(D_CACHE,
+ "%s: cli %pK add LRU: i%ld/b%ld/u%ld/l%ld/m%ld %ld\n",
+ cli_name(cli), cli,
+ atomic_long_read(&cli->cl_lru_in_list),
+ atomic_long_read(&cli->cl_lru_busy),
+ atomic_long_read(&cli->cl_unevict_lru_in_list),
+ atomic_long_read(cli->cl_lru_left),
+ cli->cl_cache->ccc_lru_max, npages);
+ }
}
}
static void __osc_lru_del(struct client_obd *cli, struct osc_page *opg)
{
- LASSERT(atomic_long_read(&cli->cl_lru_in_list) > 0);
+ LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0);
+
list_del_init(&opg->ops_lru);
- atomic_long_dec(&cli->cl_lru_in_list);
+ if (opg->ops_vm_locked) {
+ atomic_long_dec(&cli->cl_unevict_lru_in_list);
+ atomic_long_dec(&cli->cl_cache->ccc_unevict_lru_used);
+ opg->ops_vm_locked = 0;
+ } else {
+ atomic_long_dec(&cli->cl_lru_in_list);
+ }
}
/**
static void osc_lru_del(struct client_obd *cli, struct osc_page *opg)
{
if (opg->ops_in_lru) {
+ bool mlocked = false;
+
spin_lock(&cli->cl_lru_list_lock);
if (!list_empty(&opg->ops_lru)) {
+ mlocked = opg->ops_vm_locked;
__osc_lru_del(cli, opg);
} else {
LASSERT(atomic_long_read(&cli->cl_lru_busy) > 0);
}
spin_unlock(&cli->cl_lru_list_lock);
- atomic_long_inc(cli->cl_lru_left);
+ if (!mlocked)
+ atomic_long_inc(cli->cl_lru_left);
/* this is a great place to release more LRU pages if
* this osc occupies too many LRU pages and kernel is
* stealing one of them. */
}
/**
- * Drop @target of pages from LRU at most.
+ * Check whether a page is mlocked and unevictable.
*/
-long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
- long target, bool force)
+static inline bool lru_page_unevictable(struct cl_page *clpage)
+{
+ return PageMlocked(cl_page_vmpage(clpage));
+}
+
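+/*
+ * Outcome of examining a single LRU page during a shrink pass; the
+ * per-action counters are reported via D_CACHE debug messages.
+ */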
+enum shrink_action {
+ SK_ACTION_WILL_FREE = 0,
+ SK_ACTION_OWN_FAIL = 1,
+ SK_ACTION_UNEVICT_ADD = 2,
+ SK_ACTION_UNEVICT_DEL = 3,
+ SK_ACTION_BUSY_SKIP = 4,
+ SK_ACTION_INVAL = 6,
+ SK_ACTION_MAX,
+};
+
+static inline bool
+cache_unevict_check_enabled(struct client_obd *cli)
+{
+ return cli->cl_cache->ccc_mlock_pages_enable;
+}
+
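+/*
+ * Examine one page on the normal LRU list: mlock()ed pages are moved
+ * onto the unevictable list, idle pages are removed for freeing, and
+ * busy or un-ownable pages are put back at the tail of the normal list.
+ */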
+static inline enum shrink_action
+osc_normal_lru_check(const struct lu_env *env, struct client_obd *cli,
+ struct cl_io *io, struct osc_page *opg)
+{
+ struct cl_page *clpage = opg->ops_cl.cpl_page;
+ enum shrink_action action = SK_ACTION_OWN_FAIL;
+
+ if (cl_page_own_try(env, io, clpage) == 0) {
+ if (cache_unevict_check_enabled(cli) &&
+ lru_page_unevictable(clpage)) {
+ opg->ops_vm_locked = 1;
+ cl_page_disown(env, io, clpage);
+ list_move_tail(&opg->ops_lru,
+ &cli->cl_unevict_lru_list);
+ return SK_ACTION_UNEVICT_ADD;
+ }
+ if (!lru_page_busy(cli, clpage)) {
+ /*
+ * remove it from lru list earlier to avoid
+ * lock contention.
+ */
+ __osc_lru_del(cli, opg);
+ opg->ops_in_lru = 0; /* will be discarded */
+
+ cl_page_get(clpage);
+ return SK_ACTION_WILL_FREE;
+ }
+
+ cl_page_disown(env, io, clpage);
+ action = SK_ACTION_BUSY_SKIP;
+ }
+
+ list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+
+ return action;
+}
+
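+/*
+ * Examine one page on the unevictable list: a page that is idle and no
+ * longer PG_mlocked is removed for freeing; anything else stays on the
+ * unevictable list.
+ */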
+static inline enum shrink_action
+osc_unevict_lru_check(const struct lu_env *env, struct client_obd *cli,
+ struct cl_io *io, struct osc_page *opg)
+{
+ struct cl_page *clpage = opg->ops_cl.cpl_page;
+ enum shrink_action action = SK_ACTION_OWN_FAIL;
+
+ if (cl_page_own_try(env, io, clpage) == 0) {
+ if (!lru_page_busy(cli, clpage) &&
+ !lru_page_unevictable(clpage)) {
+ LASSERT(opg->ops_vm_locked == 1);
+ __osc_lru_del(cli, opg);
+ opg->ops_in_lru = 0; /* will be discarded */
+
+ cl_page_get(clpage);
+ return SK_ACTION_UNEVICT_DEL;
+ }
+
+ cl_page_disown(env, io, clpage);
+ action = SK_ACTION_BUSY_SKIP;
+ }
+
+ list_move_tail(&opg->ops_lru, &cli->cl_unevict_lru_list);
+
+ return action;
+}
+
+/*
+ * The reason a shrink pass was initiated, i.e. which LRU list is scanned.
+ */
+enum sk_reason {
+ SK_REASON_NORMAL_LRU,
+ SK_REASON_UNEVICT_LRU,
+};
+
+static inline enum shrink_action
+osc_lru_page_check(const struct lu_env *env, struct client_obd *cli,
+ enum sk_reason reason, struct cl_io *io,
+ struct osc_page *opg)
+{
+ switch (reason) {
+ case SK_REASON_NORMAL_LRU:
+ return osc_normal_lru_check(env, cli, io, opg);
+ case SK_REASON_UNEVICT_LRU:
+ return osc_unevict_lru_check(env, cli, io, opg);
+ default:
+ CERROR("%s: unsupport shrink type: %d\n",
+ cli_name(cli), reason);
+ LBUG();
+ return SK_ACTION_INVAL;
+ }
+}
+
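+/*
+ * Compute how many LRU entries one shrink pass may examine.  A forced
+ * scan of the unevictable list walks the whole list (and uses its
+ * length as the target when none was given); otherwise at most twice
+ * the target is scanned, bounded by the list length.
+ */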
+static inline int osc_lru_maxscan(enum sk_reason reason, long *target,
+ bool force, atomic_long_t *lru_in_list)
+{
+ int maxscan;
+
+ if (force && reason == SK_REASON_UNEVICT_LRU) {
+ maxscan = atomic_long_read(lru_in_list);
+ if (*target == 0)
+ *target = maxscan;
+ } else {
+ maxscan = min((*target) << 1, atomic_long_read(lru_in_list));
+ }
+
+ return maxscan;
+}
+
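+/*
+ * Walk one LRU list (normal or unevictable) and shrink it by up to
+ * @target pages.  Pages selected for freeing are collected in a pvec
+ * and discarded after the list lock is dropped; @unevict_delta, if
+ * given, returns the number of pages moved onto the unevictable list.
+ */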
+static long osc_lru_list_shrink(const struct lu_env *env,
+ struct client_obd *cli,
+ enum sk_reason reason,
+ struct list_head *lru_list,
+ atomic_long_t *lru_in_list,
+ long target, bool force,
+ long *unevict_delta)
{
- struct cl_io *io;
struct cl_object *clobj = NULL;
struct cl_page **pvec;
struct osc_page *opg;
+ struct cl_io *io;
long count = 0;
- int maxscan = 0;
int index = 0;
+ int maxscan;
int rc = 0;
+ enum shrink_action action;
+ int actnum[SK_ACTION_MAX] = { 0 };
+
ENTRY;
- LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0);
- if (atomic_long_read(&cli->cl_lru_in_list) == 0 || target <= 0)
+ LASSERT(atomic_long_read(lru_in_list) >= 0);
+ if (atomic_long_read(lru_in_list) == 0 || target < 0)
RETURN(0);
- CDEBUG(D_CACHE, "%s: shrinkers: %d, force: %d\n",
- cli_name(cli), atomic_read(&cli->cl_lru_shrinkers), force);
- if (!force) {
- if (atomic_read(&cli->cl_lru_shrinkers) > 0)
- RETURN(-EBUSY);
-
- if (atomic_inc_return(&cli->cl_lru_shrinkers) > 1) {
- atomic_dec(&cli->cl_lru_shrinkers);
- RETURN(-EBUSY);
- }
- } else {
- atomic_inc(&cli->cl_lru_shrinkers);
- }
-
pvec = (struct cl_page **)osc_env_info(env)->oti_pvec;
io = osc_env_thread_io(env);
spin_lock(&cli->cl_lru_list_lock);
- if (force)
+ if (force && reason == SK_REASON_NORMAL_LRU)
cli->cl_lru_reclaim++;
- maxscan = min(target << 1, atomic_long_read(&cli->cl_lru_in_list));
- while (!list_empty(&cli->cl_lru_list)) {
+ maxscan = osc_lru_maxscan(reason, &target, force, lru_in_list);
+ while (!list_empty(lru_list)) {
struct cl_page *page;
- bool will_free = false;
if (!force && atomic_read(&cli->cl_lru_shrinkers) > 1)
break;
if (--maxscan < 0)
break;
- opg = list_first_entry(&cli->cl_lru_list, struct osc_page,
- ops_lru);
+ opg = list_first_entry(lru_list, struct osc_page, ops_lru);
page = opg->ops_cl.cpl_page;
- if (lru_page_busy(cli, page)) {
- list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+ if (lru_page_busy(cli, page) &&
+ !(reason == SK_REASON_NORMAL_LRU &&
+ lru_page_unevictable(page))) {
+ list_move_tail(&opg->ops_lru, lru_list);
+ actnum[SK_ACTION_BUSY_SKIP]++;
continue;
}
continue;
}
- if (cl_page_own_try(env, io, page) == 0) {
- if (!lru_page_busy(cli, page)) {
- /* remove it from lru list earlier to avoid
- * lock contention */
- __osc_lru_del(cli, opg);
- opg->ops_in_lru = 0; /* will be discarded */
-
- cl_page_get(page);
- will_free = true;
- } else {
- cl_page_disown(env, io, page);
- }
- }
-
- if (!will_free) {
- list_move_tail(&opg->ops_lru, &cli->cl_lru_list);
+ action = osc_lru_page_check(env, cli, reason, io, opg);
+ actnum[action]++;
+ if (action == SK_ACTION_UNEVICT_ADD) {
+ /*
+ * The page was moved from the normal LRU list onto
+ * the unevictable list.
+ */
+ if (unevict_delta)
+ (*unevict_delta)++;
+ if (++count >= target)
+ break;
continue;
}
+ if (action != SK_ACTION_WILL_FREE &&
+ action != SK_ACTION_UNEVICT_DEL)
+ continue;
/* Don't discard and free the page with cl_lru_list held */
pvec[index++] = page;
if (++count >= target)
break;
}
+
+ CDEBUG(D_CACHE, "%s: LRU %s empty %d maxscan %d i%ld/u%ld/b%ld/l%ld actcnt %d/%d/%d/%d/%d count %ld\n",
+ cli_name(cli),
+ reason == SK_REASON_NORMAL_LRU ? "normal" : "unevict",
+ list_empty(lru_list), maxscan,
+ atomic_long_read(&cli->cl_lru_in_list),
+ atomic_long_read(&cli->cl_unevict_lru_in_list),
+ atomic_long_read(&cli->cl_lru_busy),
+ atomic_long_read(cli->cl_lru_left),
+ actnum[SK_ACTION_WILL_FREE],
+ actnum[SK_ACTION_OWN_FAIL],
+ actnum[SK_ACTION_UNEVICT_ADD],
+ actnum[SK_ACTION_UNEVICT_DEL],
+ actnum[SK_ACTION_BUSY_SKIP], count);
spin_unlock(&cli->cl_lru_list_lock);
if (clobj != NULL) {
cond_resched();
}
+ RETURN(count > 0 ? count : rc);
+}
+
+/**
+ * Drop at most @target pages from the LRU.
+ */
+long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli,
+ long target, bool force)
+{
+ struct cl_client_cache *cache = cli->cl_cache;
+ long unevict_delta = 0;
+ long shrank = 0;
+ long count = 0;
+
+ ENTRY;
+
+ LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0);
+ if (atomic_long_read(&cli->cl_lru_in_list) == 0 || target <= 0)
+ RETURN(0);
+
+ CDEBUG(D_CACHE,
+ "%s: shrinkers: %d force: %d target: %ld LRU: i%ld/u%ld/b%ld/l%ld\n",
+ cli_name(cli), atomic_read(&cli->cl_lru_shrinkers), force,
+ target, atomic_long_read(&cli->cl_lru_in_list),
+ atomic_long_read(&cli->cl_unevict_lru_in_list),
+ atomic_long_read(&cli->cl_lru_busy),
+ atomic_long_read(cli->cl_lru_left));
+ if (!force) {
+ if (atomic_read(&cli->cl_lru_shrinkers) > 0)
+ RETURN(-EBUSY);
+
+ if (atomic_inc_return(&cli->cl_lru_shrinkers) > 1) {
+ atomic_dec(&cli->cl_lru_shrinkers);
+ RETURN(-EBUSY);
+ }
+ } else {
+ atomic_inc(&cli->cl_lru_shrinkers);
+ }
+
+ count = osc_lru_list_shrink(env, cli, SK_REASON_NORMAL_LRU,
+ &cli->cl_lru_list, &cli->cl_lru_in_list,
+ target, force, &unevict_delta);
+ if (count < 0)
+ GOTO(out, count);
+
+ shrank = count;
+ if (force)
+ GOTO(out, count);
+
+ /*
+ * TODO: In non-force mode, should we also scan the unevictable list
+ * and try to free pages that are no longer marked PG_mlocked here?
+ */
+out:
atomic_dec(&cli->cl_lru_shrinkers);
- if (count > 0) {
- atomic_long_add(count, cli->cl_lru_left);
+ if (unevict_delta > 0) {
+ atomic_long_sub(unevict_delta, &cli->cl_lru_in_list);
+ atomic_long_add(unevict_delta, &cli->cl_unevict_lru_in_list);
+ atomic_long_add(unevict_delta, &cache->ccc_unevict_lru_used);
+ }
+ if (shrank > 0) {
+ atomic_long_add(shrank, cli->cl_lru_left);
+ CDEBUG(D_CACHE,
+ "%s: LRU shrink %ld i%ld/u%ld/b%ld/l%ld\n",
+ cli_name(cli), shrank,
+ atomic_long_read(&cli->cl_lru_in_list),
+ atomic_long_read(&cli->cl_unevict_lru_in_list),
+ atomic_long_read(&cli->cl_lru_busy),
+ atomic_long_read(cli->cl_lru_left));
wake_up(&osc_lru_waitq);
}
- RETURN(count > 0 ? count : rc);
+ RETURN(shrank > 0 ? shrank : count);
}
EXPORT_SYMBOL(osc_lru_shrink);
struct client_obd *scan;
int max_scans;
__u16 refcheck;
+ long shrank = 0;
long rc = 0;
+
ENTRY;
LASSERT(cache != NULL);
cli_name(cli), rc, npages);
if (osc_cache_too_much(cli) > 0)
ptlrpcd_queue_work(cli->cl_lru_work);
+ shrank = rc;
GOTO(out, rc);
} else if (rc > 0) {
+ shrank = rc;
npages -= rc;
}
- CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %ld/%ld, want: %ld\n",
- cli_name(cli), cli, atomic_long_read(&cli->cl_lru_in_list),
- atomic_long_read(&cli->cl_lru_busy), npages);
+ CDEBUG(D_CACHE,
+ "%s: cli %p no free slots, pages: i%ld/u%ld/b%ld/l%ld/m%ld, want: %ld\n",
+ cli_name(cli), cli, atomic_long_read(&cli->cl_lru_in_list),
+ atomic_long_read(&cli->cl_unevict_lru_in_list),
+ atomic_long_read(&cli->cl_lru_busy),
+ atomic_long_read(cli->cl_lru_left),
+ cli->cl_cache->ccc_lru_max, npages);
/* Reclaim LRU slots from other client_obd as it can't free enough
* from its own. This should rarely happen. */
(scan = list_first_entry_or_null(&cache->ccc_lru,
struct client_obd,
cl_lru_osc)) != NULL) {
- CDEBUG(D_CACHE, "%s: cli %p LRU pages: %ld, busy: %ld.\n",
+ CDEBUG(D_CACHE,
+ "%s: cli %p LRU pages: %ld, busy: %ld, unevict: %ld.\n",
cli_name(scan), scan,
atomic_long_read(&scan->cl_lru_in_list),
- atomic_long_read(&scan->cl_lru_busy));
+ atomic_long_read(&scan->cl_lru_busy),
+ atomic_long_read(&scan->cl_unevict_lru_in_list));
list_move_tail(&scan->cl_lru_osc, &cache->ccc_lru);
if (osc_cache_too_much(scan) > 0) {
rc = osc_lru_shrink(env, scan, npages, true);
spin_lock(&cache->ccc_lru_lock);
- if (rc >= npages)
+ if (rc >= npages) {
+ shrank += rc;
break;
- if (rc > 0)
+ }
+ if (rc > 0) {
+ shrank += rc;
npages -= rc;
+ }
}
}
spin_unlock(&cache->ccc_lru_lock);
+ if (shrank > 0)
+ GOTO(out, rc);
out:
cl_env_put(env, &refcheck);
- CDEBUG(D_CACHE, "%s: cli %p freed %ld pages.\n",
- cli_name(cli), cli, rc);
- return rc;
+ CDEBUG(D_CACHE, "%s: cli %p freed %ld/%ld pages.\n",
+ cli_name(cli), cli, rc, shrank);
+ return shrank > 0 ? shrank : rc;
}
/**
wake_up(&osc_lru_waitq);
}
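+
+/*
+ * Shrink the unevictable LRU list: pages that are idle and no longer
+ * mlock()ed are removed and discarded, everything else remains on the
+ * list.
+ */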
+long osc_unevict_cache_shrink(const struct lu_env *env, struct client_obd *cli)
+{
+ long rc;
+
+ ENTRY;
+
+ rc = osc_lru_list_shrink(env, cli, SK_REASON_UNEVICT_LRU,
+ &cli->cl_unevict_lru_list,
+ &cli->cl_unevict_lru_in_list,
+ 0, true, NULL);
+
+ RETURN(rc);
+}
+
/**
* Atomic operations are expensive. We accumulate the accounting for the
* same page zone to get better performance.
RETURN(0);
}
+ if (KEY_IS(KEY_UNEVICT_CACHE_SHRINK)) {
+ struct client_obd *cli = &obd->u.cli;
+ long ret;
+
+ /* The return value is intentionally ignored here. */
+ (void)osc_unevict_cache_shrink(env, cli);
+
+ /*
+ * Clear unused cache pages and move mlock()ed pages from
+ * the normal LRU list onto the unevictable LRU list.
+ */
+ ret = osc_lru_shrink(env, cli,
+ atomic_long_read(&cli->cl_lru_in_list),
+ true);
+ if (ret > 0)
+ ret = 0;
+
+ RETURN(ret);
+ }
+
if (!set && !KEY_IS(KEY_GRANT_SHRINK))
RETURN(-EINVAL);
static int osc_cache_shrink(struct shrinker *shrinker,
struct shrink_control *sc)
{
+ if (!osc_page_cache_shrink_enabled)
+ return 0;
+
(void)osc_cache_shrink_scan(shrinker, sc);
return osc_cache_shrink_count(shrinker, sc);
}
run_test 460d "Check encrypt pools output"
+resident_pages() {
+ local file=$1
+
+ vmtouch $file | awk '/Resident Pages:/ {print $3}' |
+ awk -F/ '{ print $1 }'
+}
+
+# The command "echo 2 > /proc/sys/vm/drop_caches" may revoke the DLM locks
+# due to slab cache reclaim. Thus we should avoid to reclaim slab cache for
+# DLM locks during testing since it may evict mlock()ed pages due to the
+# release of the DLM extent lock.
+# After the page cache shrinker is disabled, "echo 3 > /proc/sys/vm/drop_caches"
+# and "echo 2 > /proc/sys/vm/drop_caches" will not scan and clear unused pages
+# from the LRU list.
+disable_page_cache_shrink() {
+ local enabled=$($LCTL get_param -n osc.*.enable_page_cache_shrink |
+ head -n 1)
+
+ stack_trap "$LCTL set_param osc.*.enable_page_cache_shrink=$enabled"
+ $LCTL set_param osc.*.enable_page_cache_shrink=0
+}
+
+enable_mlock_pages_check() {
+ local enabled=$($LCTL get_param -n llite.*.enable_mlock_pages)
+
+ stack_trap "$LCTL set_param llite.*.enable_mlock_pages=$enabled"
+ $LCTL set_param llite.*.enable_mlock_pages=1
+}
+
+test_600a() {
+ local file=$DIR/$tfile
+ local size_mb=100
+ local pcnt=$((size_mb * 1024 * 1024 / PAGE_SIZE))
+
+ which vmtouch || skip_env "This test needs vmtouch utility"
+ check_set_fallocate_or_skip
+ disable_page_cache_shrink
+ enable_mlock_pages_check
+
+ fallocate -l ${size_mb}M $file || error "failed to fallocate $file"
+ stack_trap "pkill -9 vmtouch || true"
+ vmtouch -vltdw -m 1g $file || error "failed to vmtouch $file"
+
+ local rcnt=$(resident_pages $file)
+
+ echo "before drop_caches (0):"
+ grep Mlocked: /proc/meminfo
+ $LCTL get_param llite.*.max_cached_mb
+ echo "drop page caches (1):"
+ echo 1 > /proc/sys/vm/drop_caches
+ grep Mlocked: /proc/meminfo
+ $LCTL get_param llite.*.max_cached_mb
+ vmtouch $file
+ rcnt=$(resident_pages $file)
+ (( $pcnt == $rcnt )) || error "resident pages are $rcnt, expected $pcnt"
+
+ local unevict_mb
+
+ $LCTL set_param llite.*.unevict_cached_mb=clear
+ $LCTL get_param llite.*.unevict_cached_mb
+ unevict_mb=$($LCTL get_param -n llite.*.unevict_cached_mb)
+ (( $unevict_mb == $size_mb )) ||
+ error "unevict_cached_mb is $unevict_mb, expected $size_mb"
+
+ $LCTL set_param $OSC.*$OSC*.osc_unevict_cached_mb=clear
+ $LCTL get_param $OSC.*$OSC*.osc_unevict_cached_mb
+ unevict_mb=$($LCTL get_param -n $OSC.*$OSC*.osc_unevict_cached_mb |
+ awk '{sum += $1 } END { print sum }')
+ (( $unevict_mb == $size_mb )) ||
+ error "osc_unevict_cached_mb is $unevict_mb, expected $size_mb"
+
+ # Lock revocation evicts the cached pages protected by the lock.
+ # This is the desired behavior for conflicting access from a remote
+ # client. But how should lock revocation triggered by LRU lock
+ # shrinking on the client side be handled: should locks protecting
+ # mlock()ed pages be canceled in that case, or should such locks not
+ # be put on the lock LRU list at all?
+ cancel_lru_locks $OSC
+ echo "drop lru DLM lock:"
+ grep Mlocked: /proc/meminfo
+ $LCTL get_param llite.*.max_cached_mb
+ $LCTL get_param osc.*.osc_cached_mb
+ rcnt=$(resident_pages $file)
+ (( $rcnt == 0 )) || error "resident pages are $rcnt, expected zero"
+ unevict_mb=$($LCTL get_param -n llite.*.unevict_cached_mb)
+ (( $unevict_mb == 0 )) ||
+ error "unevict_cached_mb is $unevict_mb, expected 0"
+ unevict_mb=$($LCTL get_param -n $OSC.*$OSC*.osc_unevict_cached_mb |
+ awk '{sum += $1 } END { print sum }')
+ (( $unevict_mb == 0 )) ||
+ error "osc_unevict_cached_mb is $unevict_mb, expected $size_mb"
+}
+run_test 600a "basic test for mlock()ed file"
+
+test_600b() {
+ local file=$DIR/$tfile
+ local size_mb=100
+ local cache_limit=64
+ local max_cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+ awk '/^max_cached_mb/ { print $2 }')
+
+ which vmtouch || skip_env "This test needs vmtouch utility"
+ check_set_fallocate_or_skip
+ disable_page_cache_shrink
+ enable_mlock_pages_check
+
+ fallocate -l ${size_mb}M $file || error "failed to fallocate $file"
+ stack_trap "pkill -9 vmtouch || true"
+
+ cancel_lru_locks $OSC
+ $LCTL get_param llite.*.max_cached_mb
+ stack_trap "$LCTL set_param llite.*.max_cached_mb=$max_cached_mb"
+ $LCTL set_param llite.*.max_cached_mb=$cache_limit
+
+ # The amount of mlock()ed data (100M) is larger than max_cached_mb (64M).
+ vmtouch -vltdw -m 1g $file || error "failed to mlock $file"
+ vmtouch $file
+ grep Mlocked: /proc/meminfo
+
+ local used_mb
+ local unevict_mb
+
+ echo 1 > /proc/sys/vm/drop_caches
+ $LCTL get_param llite.*.max_cached_mb
+ $LCTL set_param llite.*.unevict_cached_mb=clear
+ used_mb=$($LCTL get_param llite.*.max_cached_mb |
+ awk '/^used_mb/ { print $2 }')
+ unevict_mb=$($LCTL get_param -n llite.*.unevict_cached_mb)
+ (( $used_mb == 0 )) || error "used_mb is $used_mb, expected 0"
+ (( $unevict_mb == $size_mb )) ||
+ error "unevict_mb is $unevict_mb, expected $size_mb"
+}
+run_test 600b "mlock a file (via vmtouch) larger than max_cached_mb"
+
+test_600c() {
+ local dir=$DIR/$tdir
+ local cache_limit=64
+ local max_cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+ awk '/^max_cached_mb/ { print $2 }')
+
+ which vmtouch || skip_env "This test needs vmtouch utility"
+ check_set_fallocate_or_skip
+ disable_page_cache_shrink
+ enable_mlock_pages_check
+
+ stack_trap "rm -rf $dir"
+ stack_trap "$LCTL set_param llite.*.max_cached_mb=$max_cached_mb"
+ $LCTL set_param llite.*.max_cached_mb=$cache_limit
+ stack_trap "pkill -9 vmtouch || true"
+
+ local size=$((64 * 1048576))
+ local file1=$dir/$tfile.1
+ local file2=$dir/$tfile.2
+
+ mkdir $dir || error "failed to mkdir $dir"
+ fallocate -l $size $file1 || error "failed to fallocate $file1"
+ fallocate -l $size $file2 || error "failed to fallocate $file2"
+ cancel_lru_locks $OSC
+
+ vmtouch -vltdw -m 1g $file1 || error "failed to vmtouch $file1"
+ $LCTL get_param llite.*.max_cached_mb
+ $LCTL set_param llite.*.unevict_cached_mb=clear
+ $LCTL get_param llite.*.max_cached_mb
+
+ local cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+ awk '/^used_mb/ { print $2 }')
+
+ [ $cached_mb -eq 0 ] || error "expected used_mb 0 got $cached_mb"
+ cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+ awk '/^unevict_mb/ { print $2 }')
+ [ $cached_mb -eq 64 ] || error "expected unevict_mb 64 got $cached_mb"
+
+ vmtouch -vt $file2 || error "failed to vmtouch $file2"
+ echo 3 > /proc/sys/vm/drop_caches
+ dd if=$file2 of=/dev/null bs=1M count=64 ||
+ error "failed to reading $file2 into cache"
+
+ pkill -9 vmtouch || error "failed to kill vmtouch"
+ vmtouch -vt $file2 || error "failed to load $file2 into cache"
+ $LCTL get_param llite.*.max_cached_mb
+ echo 1 > /proc/sys/vm/drop_caches
+ $LCTL set_param llite.*.unevict_cached_mb=clear
+ cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+ awk '/^used_mb/ { print $2 }')
+ [ $cached_mb -eq 0 ] || error "expected used_mb 0 got $cached_mb"
+ cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+ awk '/^unevict_mb/ { print $2 }')
+ [ $cached_mb -eq 0 ] || error "expected unevict_mb 0 got $cached_mb"
+}
+run_test 600c "Test I/O when mlocked page count > @max_cached_mb"
+
+test_600d_base() {
+ local mlcksz=$1
+ local fsz=$2
+ local n=$3
+ local dir=$DIR/$tdir
+ local mlckf=$dir/mlockfile
+
+ echo "mlock size: $mlcksz file size: $fsz, n: $n"
+ mkdir -p $dir || error "mkdir $dir failed"
+
+ fallocate -l $mlcksz $mlckf || error "failed to fallocate $mlckf"
+ for ((i = 0; i < $n; i++)); do
+ fallocate -l $fsz $dir/$tfile.$i ||
+ error "failed to fallocate $dir/$tfile.$i"
+ done
+
+ cancel_lru_locks $OSC
+
+ declare -a pids
+
+ vmtouch -vltdw -m 1G $mlckf || error "failed to mlock $mlckf"
+ for ((i = 0; i < $n; i++)); do
+ vmtouch -t -m 1g $dir/$tfile.$i &
+ pids[i]=$!
+ done
+
+ grep Mlocked: /proc/meminfo
+ $LCTL get_param llite.*.max_cached_mb
+ echo "drop caches:"
+ echo 1 > /proc/sys/vm/drop_caches
+ $LCTL set_param llite.*.unevict_cached_mb=clear
+ $LCTL get_param llite.*.max_cached_mb
+
+ for ((i = 0; i < $n; i++)); do
+ wait ${pids[i]} || error "vmtouch $dir/$tfile.$i failed: rc = $?"
+ done
+
+ grep Mlocked: /proc/meminfo
+ pkill -9 vmtouch || true
+ rm -rvf $dir || error "failed to rm $dir"
+}
+
+test_600d() {
+ local dir=$DIR/$tdir
+ local cache_limit=64
+ local max_cached_mb=$($LCTL get_param llite.*.max_cached_mb |
+ awk '/^max_cached_mb/ { print $2 }')
+
+ which vmtouch || skip_env "This test needs vmtouch utility"
+ check_set_fallocate_or_skip
+ disable_page_cache_shrink
+ enable_mlock_pages_check
+
+ stack_trap "rm -rf $dir"
+ stack_trap "$LCTL set_param llite.*.max_cached_mb=$max_cached_mb"
+ $LCTL set_param llite.*.max_cached_mb=$cache_limit
+ stack_trap "pkill -9 vmtouch || true"
+
+ local size=$((cache_limit * 1048576))
+
+ test_600d_base $((size - PAGE_SIZE)) 4096 16
+ test_600d_base $((size - 2 * PAGE_SIZE)) 16384 16
+}
+run_test 600d "Test I/O with limited LRU page slots (some was mlocked)"
+
prep_801() {
[[ $MDS1_VERSION -lt $(version_code 2.9.55) ]] ||
[[ $OST1_VERSION -lt $(version_code 2.9.55) ]] &&