From: Wang Shilong
Date: Sun, 28 Jun 2020 08:35:07 +0000 (+0800)
Subject: LU-13669 llite: try to improve mmap performance
X-Git-Tag: 2.13.57~16
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=0c5ad4b6df5bf35b291842fc6d42c2720246a026;p=fs%2Flustre-release.git

LU-13669 llite: try to improve mmap performance

We have observed slow mmap read performance for some applications.
The problem is that the access pattern is neither sequential nor
strided, but may still be adjacent within a small range before
seeking to a random position.

So the pattern could be something like this:

[1M data] [hole] [0.5M data] [hole] [0.7M data] [1M data]

Every time an application reads mmap data, it may read not only a
single 4KB page, but also a cluster of nearby pages within a range
(e.g. 1MB) of the first page after a cache miss.

The readahead engine is modified to track the range size of a
cluster of mmap reads, so that after a seek and/or cache miss, the
range size is used to efficiently prefetch multiple pages in a
single RPC rather than many small RPCs.

Benchmark:

fio --name=randread --directory=/ai400/fio --rw=randread \
    --ioengine=mmap --bs=128K --numjobs=32 --filesize=200G \
    --filename=randread --time_based --status-interval=10s \
    --runtime=30s --allow_file_create=1 --group_reporting \
    --disable_lat=1 --disable_clat=1 --disable_slat=1 \
    --disk_util=0 --aux-path=/tmp --randrepeat=0 \
    --unique_filename=0 --fallocate=0

               |  master   |  patched  |  speedup  |
---------------+-----------+-----------+-----------+
page_fault_avg |   512usec |    52usec |     9.75x |
page_fault_max | 37698usec |  6543usec |     5.76x |

Change-Id: I86436cbd027ec6df5094599e54b4acfd0e018930
Signed-off-by: Wang Shilong
Reviewed-on: https://review.whamcloud.com/38916
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Andreas Dilger
Reviewed-by: Yingjin Qian
Reviewed-by: Oleg Drokin
---

diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index a0e3036..1a061b0 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -518,6 +518,12 @@ static inline struct pcc_inode *ll_i2pcci(struct inode *inode)
 /* default read-ahead full files smaller than limit on the second read */
 #define SBI_DEFAULT_READ_AHEAD_WHOLE_MAX	MiB_TO_PAGES(2UL)
 
+/* default range pages */
+#define SBI_DEFAULT_RA_RANGE_PAGES		MiB_TO_PAGES(1ULL)
+
+/* Min range pages */
+#define RA_MIN_MMAP_RANGE_PAGES			16UL
+
 enum ra_stat {
 	RA_STAT_HIT = 0,
 	RA_STAT_MISS,
@@ -534,6 +540,7 @@ enum ra_stat {
 	RA_STAT_FAILED_REACH_END,
 	RA_STAT_ASYNC,
 	RA_STAT_FAILED_FAST_READ,
+	RA_STAT_MMAP_RANGE_READ,
 	_NR_RA_STAT,
 };
 
@@ -541,6 +548,7 @@ struct ll_ra_info {
 	atomic_t ra_cur_pages;
 	unsigned long ra_max_pages;
 	unsigned long ra_max_pages_per_file;
+	unsigned long ra_range_pages;
 	unsigned long ra_max_read_ahead_whole_pages;
 	struct workqueue_struct *ll_readahead_wq;
 	/*
@@ -810,6 +818,16 @@ struct ll_readahead_state {
 	 */
 	pgoff_t ras_window_start_idx;
 	pgoff_t ras_window_pages;
+
+	/* Page index where min range read starts */
+	pgoff_t ras_range_min_start_idx;
+	/* Page index where mmap range read ends */
+	pgoff_t ras_range_max_end_idx;
+	/* number of mmap pages where last time detected */
+	pgoff_t ras_last_range_pages;
+	/* number of mmap range requests */
+	pgoff_t ras_range_requests;
+
 	/*
 	 * Optimal RPC size in pages.
 	 * It decides how many pages will be sent for each read-ahead.
diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c
index 3744aa2..066f6e3 100644
--- a/lustre/llite/llite_lib.c
+++ b/lustre/llite/llite_lib.c
@@ -136,6 +136,7 @@ static struct ll_sb_info *ll_init_sbi(void)
 					   SBI_DEFAULT_READ_AHEAD_PER_FILE_MAX);
 	sbi->ll_ra_info.ra_async_pages_per_file_threshold =
 		sbi->ll_ra_info.ra_max_pages_per_file;
+	sbi->ll_ra_info.ra_range_pages = SBI_DEFAULT_RA_RANGE_PAGES;
 	sbi->ll_ra_info.ra_max_read_ahead_whole_pages = -1;
 	atomic_set(&sbi->ll_ra_info.ra_async_inflight, 0);
 
diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c
index f6a7699..cf753a5 100644
--- a/lustre/llite/lproc_llite.c
+++ b/lustre/llite/lproc_llite.c
@@ -1159,6 +1159,51 @@ read_ahead_async_file_threshold_mb_store(struct kobject *kobj,
 }
 LUSTRE_RW_ATTR(read_ahead_async_file_threshold_mb);
 
+static ssize_t read_ahead_range_kb_show(struct kobject *kobj,
+					struct attribute *attr, char *buf)
+{
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+
+	return snprintf(buf, PAGE_SIZE, "%lu\n",
+			sbi->ll_ra_info.ra_range_pages << (PAGE_SHIFT - 10));
+}
+
+static ssize_t
+read_ahead_range_kb_store(struct kobject *kobj,
+			  struct attribute *attr,
+			  const char *buffer, size_t count)
+{
+	unsigned long pages_number;
+	unsigned long max_ra_per_file;
+	u64 val;
+	struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info,
+					      ll_kset.kobj);
+	int rc;
+
+	rc = sysfs_memparse(buffer, count, &val, "KiB");
+	if (rc < 0)
+		return rc;
+
+	pages_number = val >> PAGE_SHIFT;
+	/* Disable mmap range read */
+	if (pages_number == 0)
+		goto out;
+
+	max_ra_per_file = sbi->ll_ra_info.ra_max_pages_per_file;
+	if (pages_number > max_ra_per_file ||
+	    pages_number < RA_MIN_MMAP_RANGE_PAGES)
+		return -ERANGE;
+
+out:
+	spin_lock(&sbi->ll_lock);
+	sbi->ll_ra_info.ra_range_pages = pages_number;
+	spin_unlock(&sbi->ll_lock);
+
+	return count;
+}
+LUSTRE_RW_ATTR(read_ahead_range_kb);
+
 static ssize_t fast_read_show(struct kobject *kobj,
 			      struct attribute *attr,
 			      char *buf)
@@ -1493,6 +1538,7 @@ static struct attribute *llite_attrs[] = {
 	&lustre_attr_max_read_ahead_whole_mb.attr,
 	&lustre_attr_max_read_ahead_async_active.attr,
 	&lustre_attr_read_ahead_async_file_threshold_mb.attr,
+	&lustre_attr_read_ahead_range_kb.attr,
 	&lustre_attr_stats_track_pid.attr,
 	&lustre_attr_stats_track_ppid.attr,
 	&lustre_attr_stats_track_gid.attr,
@@ -1605,6 +1651,7 @@ static const char *ra_stat_string[] = {
 	[RA_STAT_FAILED_REACH_END] = "failed to reach end",
 	[RA_STAT_ASYNC] = "async readahead",
 	[RA_STAT_FAILED_FAST_READ] = "failed to fast read",
+	[RA_STAT_MMAP_RANGE_READ] = "mmap range read",
 };
 
 int ll_debugfs_register_super(struct super_block *sb, const char *name)
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index 0cc784d..daebffa 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -386,7 +386,7 @@ static bool ras_inside_ra_window(pgoff_t idx, struct ra_io_arg *ria)
 static unsigned long
 ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
 		    struct cl_page_list *queue, struct ll_readahead_state *ras,
-		    struct ra_io_arg *ria, pgoff_t *ra_end)
+		    struct ra_io_arg *ria, pgoff_t *ra_end, pgoff_t skip_index)
 {
 	struct cl_read_ahead ra = { 0 };
 	/* busy page count is per stride */
@@ -399,6 +399,8 @@ ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
 	for (page_idx = ria->ria_start_idx;
 	     page_idx <= ria->ria_end_idx && ria->ria_reserved > 0;
 	     page_idx++) {
+		if (skip_index && page_idx == skip_index)
+			continue;
 		if (ras_inside_ra_window(page_idx, ria)) {
 			if (ra.cra_end_idx == 0 || ra.cra_end_idx < page_idx) {
 				pgoff_t end_idx;
@@ -442,10 +444,12 @@ ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
 				if (ras->ras_rpc_pages != ra.cra_rpc_pages &&
 				    ra.cra_rpc_pages > 0)
 					ras->ras_rpc_pages = ra.cra_rpc_pages;
-				/* trim it to align with optimal RPC size */
-				end_idx = ras_align(ras, ria->ria_end_idx + 1);
-				if (end_idx > 0 && !ria->ria_eof)
-					ria->ria_end_idx = end_idx - 1;
+				if (!skip_index) {
+					/* trim it to align with optimal RPC size */
+					end_idx = ras_align(ras, ria->ria_end_idx + 1);
+					if (end_idx > 0 && !ria->ria_eof)
+						ria->ria_end_idx = end_idx - 1;
+				}
 				if (ria->ria_end_idx < ria->ria_end_idx_min)
 					ria->ria_end_idx = ria->ria_end_idx_min;
 			}
@@ -638,7 +642,7 @@ static void ll_readahead_handle_work(struct work_struct *wq)
 	cl_2queue_init(queue);
 
 	rc = ll_read_ahead_pages(env, io, &queue->c2_qin, ras, ria,
-				 &ra_end_idx);
+				 &ra_end_idx, 0);
 	if (ria->ria_reserved != 0)
 		ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved);
 	if (queue->c2_qin.pl_nr > 0) {
@@ -676,7 +680,7 @@ out_free_work:
 static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 			struct cl_page_list *queue,
 			struct ll_readahead_state *ras, bool hit,
-			struct file *file)
+			struct file *file, pgoff_t skip_index)
 {
 	struct vvp_io *vio = vvp_env_io(env);
 	struct ll_thread_info *lti = ll_env_info(env);
@@ -720,6 +724,9 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	if (ras->ras_window_pages > 0)
 		end_idx = ras->ras_window_start_idx + ras->ras_window_pages - 1;
 
+	if (skip_index)
+		end_idx = start_idx + ras->ras_window_pages - 1;
+
 	/* Enlarge the RA window to encompass the full read */
 	if (vio->vui_ra_valid &&
 	    end_idx < vio->vui_ra_start_idx + vio->vui_ra_pages - 1)
@@ -772,6 +779,10 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 			ria->ria_start_idx;
 	}
 
+	/* don't over reserved for mmap range read */
+	if (skip_index)
+		pages_min = 0;
+
 	ria->ria_reserved = ll_ra_count_get(ll_i2sbi(inode), ria, pages,
 					    pages_min);
 	if (ria->ria_reserved < pages)
@@ -782,8 +793,8 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	       atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages),
 	       ll_i2sbi(inode)->ll_ra_info.ra_max_pages);
 
-	ret = ll_read_ahead_pages(env, io, queue, ras, ria, &ra_end_idx);
-
+	ret = ll_read_ahead_pages(env, io, queue, ras, ria, &ra_end_idx,
+				  skip_index);
 	if (ria->ria_reserved != 0)
 		ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved);
 
@@ -879,6 +890,10 @@ void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
 	ras_reset(ras, 0);
 	ras->ras_last_read_end_bytes = 0;
 	ras->ras_requests = 0;
+	ras->ras_range_min_start_idx = 0;
+	ras->ras_range_max_end_idx = 0;
+	ras->ras_range_requests = 0;
+	ras->ras_last_range_pages = 0;
 }
 
 /*
@@ -1027,6 +1042,73 @@ static inline bool is_loose_seq_read(struct ll_readahead_state *ras, loff_t pos)
 			     8UL << PAGE_SHIFT, 8UL << PAGE_SHIFT);
 }
 
+static inline bool is_loose_mmap_read(struct ll_sb_info *sbi,
+				      struct ll_readahead_state *ras,
+				      unsigned long pos)
+{
+	unsigned long range_pages = sbi->ll_ra_info.ra_range_pages;
+
+	return pos_in_window(pos, ras->ras_last_read_end_bytes,
+			     range_pages << PAGE_SHIFT,
+			     range_pages << PAGE_SHIFT);
+}
+
+/**
+ * We have observed slow mmap read performance for some
+ * applications. The problem is if access pattern is neither
+ * sequential nor stride, but could be still adjacent in a
+ * small range and then seek a random position.
+ *
+ * So the pattern could be something like this:
+ *
+ * [1M data] [hole] [0.5M data] [hole] [0.7M data] [1M data]
+ *
+ *
+ * Every time an application reads mmap data, it may not only
+ * read a single 4KB page, but also a cluster of nearby pages in
+ * a range (e.g. 1MB) of the first page after a cache miss.
+ *
+ * The readahead engine is modified to track the range size of
+ * a cluster of mmap reads, so that after a seek and/or cache miss,
+ * the range size is used to efficiently prefetch multiple pages
+ * in a single RPC rather than many small RPCs.
+ */
+static void ras_detect_cluster_range(struct ll_readahead_state *ras,
+				     struct ll_sb_info *sbi,
+				     unsigned long pos, unsigned long count)
+{
+	pgoff_t last_pages, pages;
+	pgoff_t end_idx = (pos + count - 1) >> PAGE_SHIFT;
+
+	last_pages = ras->ras_range_max_end_idx -
+			ras->ras_range_min_start_idx + 1;
+	/* First time come here */
+	if (!ras->ras_range_max_end_idx)
+		goto out;
+
+	/* Random or Stride read */
+	if (!is_loose_mmap_read(sbi, ras, pos))
+		goto out;
+
+	ras->ras_range_requests++;
+	if (ras->ras_range_max_end_idx < end_idx)
+		ras->ras_range_max_end_idx = end_idx;
+
+	if (ras->ras_range_min_start_idx > (pos >> PAGE_SHIFT))
+		ras->ras_range_min_start_idx = pos >> PAGE_SHIFT;
+
+	/* Out of range, consider it as random or stride */
+	pages = ras->ras_range_max_end_idx -
+			ras->ras_range_min_start_idx + 1;
+	if (pages <= sbi->ll_ra_info.ra_range_pages)
+		return;
+out:
+	ras->ras_last_range_pages = last_pages;
+	ras->ras_range_requests = 0;
+	ras->ras_range_min_start_idx = pos >> PAGE_SHIFT;
+	ras->ras_range_max_end_idx = end_idx;
+}
+
 static void ras_detect_read_pattern(struct ll_readahead_state *ras,
 				    struct ll_sb_info *sbi,
 				    loff_t pos, size_t count, bool mmap)
@@ -1075,8 +1157,12 @@ static void ras_detect_read_pattern(struct ll_readahead_state *ras,
 	ras->ras_consecutive_bytes += count;
 	if (mmap) {
 		pgoff_t idx = ras->ras_consecutive_bytes >> PAGE_SHIFT;
+		unsigned long ra_range_pages =
+			max_t(unsigned long, RA_MIN_MMAP_RANGE_PAGES,
+			      sbi->ll_ra_info.ra_range_pages);
 
-		if ((idx >= 4 && (idx & 3UL) == 0) || stride_detect)
+		if ((idx >= ra_range_pages &&
+		     idx % ra_range_pages == 0) || stride_detect)
 			ras->ras_need_increase_window = true;
 	} else if ((ras->ras_consecutive_requests > 1 || stride_detect)) {
 		ras->ras_need_increase_window = true;
@@ -1185,10 +1271,36 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 	if (ras->ras_no_miss_check)
 		GOTO(out_unlock, 0);
 
-	if (flags & LL_RAS_MMAP)
+	if (flags & LL_RAS_MMAP) {
+		unsigned long ra_pages;
+
+		ras_detect_cluster_range(ras, sbi, index << PAGE_SHIFT,
+					 PAGE_SIZE);
 		ras_detect_read_pattern(ras, sbi, (loff_t)index << PAGE_SHIFT,
 					PAGE_SIZE, true);
 
+		/* we did not detect anything but we could prefetch */
+		if (!ras->ras_need_increase_window &&
+		    ras->ras_window_pages <= sbi->ll_ra_info.ra_range_pages &&
+		    ras->ras_range_requests >= 2) {
+			if (!hit) {
+				ra_pages = max_t(unsigned long,
+					RA_MIN_MMAP_RANGE_PAGES,
+					ras->ras_last_range_pages);
+				if (index < ra_pages / 2)
+					index = 0;
+				else
+					index -= ra_pages / 2;
+				ras->ras_window_pages = ra_pages;
+				ll_ra_stats_inc_sbi(sbi,
+					RA_STAT_MMAP_RANGE_READ);
+			} else {
+				ras->ras_window_pages = 0;
+			}
+			goto skip;
+		}
+	}
+
 	if (!hit && ras->ras_window_pages &&
 	    index < ras->ras_next_readahead_idx &&
 	    pos_in_window(index, ras->ras_window_start_idx, 0,
@@ -1227,6 +1339,8 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 			GOTO(out_unlock, 0);
 		}
 	}
+
+skip:
 	ras_set_start(ras, index);
 
 	if (stride_io_mode(ras)) {
@@ -1495,8 +1609,12 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 	io_end_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos +
 				io->u.ci_rw.crw_count - 1);
 	if (ll_readahead_enabled(sbi) && ras) {
+		pgoff_t skip_index = 0;
+
+		if (ras->ras_next_readahead_idx < vvp_index(vpg))
+			skip_index = vvp_index(vpg);
 		rc2 = ll_readahead(env, io, &queue->c2_qin, ras,
-				   uptodate, file);
+				   uptodate, file, skip_index);
 		CDEBUG(D_READA, DFID " %d pages read ahead at %lu\n",
 		       PFID(ll_inode2fid(inode)), rc2, vvp_index(vpg));
 	} else if (vvp_index(vpg) == io_start_index &&
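
For readers who want to reproduce the targeted access pattern outside of fio, the sketch
below is a minimal userspace illustration (not part of the patch): it mmaps a file on a
Lustre mount, then repeatedly seeks to a random 1MB-aligned range and touches the pages
inside that range. Without range readahead each first touch after a seek faults in a
single page and issues a small RPC; with this change the detected range size is prefetched
in larger RPCs. The file path, file size, and loop counts are hypothetical example values.

/*
 * Illustrative sketch only: generate the "cluster read inside a range,
 * then seek" mmap pattern described in the commit message.
 * The path "/mnt/lustre/randread" and the iteration count are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/mnt/lustre/randread";	/* hypothetical file */
	const size_t range = 1UL << 20;			/* 1MB cluster size */
	unsigned long sum = 0;
	struct stat st;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0 || fstat(fd, &st) < 0 || (size_t)st.st_size < range) {
		perror(path);
		return 1;
	}

	char *map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	srandom(42);
	for (int i = 0; i < 1024; i++) {
		/* seek: pick a random 1MB-aligned offset in the file */
		size_t start = (random() % (st.st_size / range)) * range;

		/* cluster read: touch every page inside the range; each
		 * first touch takes a page fault on the Lustre client */
		for (size_t off = 0; off < range; off += 4096)
			sum += (unsigned char)map[start + off];
	}

	munmap(map, st.st_size);
	close(fd);
	printf("checksum %lu\n", sum);
	return 0;
}

The tracked range size should be tunable through the read_ahead_range_kb attribute added
by this patch (for example via lctl set_param on the llite parameters), with 0 disabling
mmap range read as handled in read_ahead_range_kb_store(); the exact parameter path on a
given system is an assumption here and should be checked under the mounted client.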