From: Jinshan Xiong
Date: Thu, 4 Sep 2014 16:56:25 +0000 (-0700)
Subject: LU-7990 clio: revise readahead to support 16MB IO
X-Git-Tag: 2.8.53~9
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=d8467ab8a2ca15fbbd5be3429c9cf9ceb0fa78b8

LU-7990 clio: revise readahead to support 16MB IO

The major work is to adjust the read-ahead policy to issue read-ahead
RPCs according to the underlying RPC size.

The test case is backported from Andrew Perepechko's
"LU-7140 llite: too few 4 MiB RPCs from readahead":
http://review.whamcloud.com/16374

Signed-off-by: Jinshan Xiong
Signed-off-by: Gu Zheng
Change-Id: Ib25ff37712389cca7fcee86996124e7c9dd4ec9a
Reviewed-on: http://review.whamcloud.com/19368
Tested-by: Jenkins
Reviewed-by: Andreas Dilger
Tested-by: Maloo
Reviewed-by: Li Xi
Reviewed-by: Oleg Drokin
---
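
In short, the new policy only ever trims the read-ahead window down to a
boundary of the optimal RPC size (it never extends it), so sequential reads
end up issuing full-size BRW RPCs. Below is a minimal, self-contained sketch
of that arithmetic; the helper names echo the patch, but the harness and the
4096-page (16 MiB with 4 KiB pages) RPC size are illustrative assumptions,
not code from this change.

#include <assert.h>

/* Sketch: trim a read-ahead window [start, end] (page indexes, end
 * inclusive) so it finishes on a multiple of rpc_size pages. */
static unsigned long align_down(unsigned long index, unsigned long rpc_size)
{
	return index - (index % rpc_size);
}

static unsigned long trim_window_end(unsigned long start, unsigned long end,
				     unsigned long rpc_size)
{
	unsigned long aligned = align_down(end + 1, rpc_size);

	/* only trim, never extend, and never trim past the window start */
	if (aligned > start)
		return aligned - 1;
	return end;
}

int main(void)
{
	/* 16 MiB RPCs with 4 KiB pages: rpc_size = 4096 pages (assumed) */
	assert(trim_window_end(0, 5000, 4096) == 4095);
	/* a window smaller than one RPC is left alone */
	assert(trim_window_end(0, 1000, 4096) == 1000);
	return 0;
}
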
diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h
index 64bc6bf..9102f2a 100644
--- a/lustre/include/cl_object.h
+++ b/lustre/include/cl_object.h
@@ -1456,7 +1456,9 @@ struct cl_read_ahead {
 	 * This is determined DLM lock coverage, RPC and stripe boundary.
 	 * cra_end is included. */
 	pgoff_t cra_end;
-	/* Release routine. If readahead holds resources underneath, this
+	/* optimal RPC size for this read, by pages */
+	unsigned long cra_rpc_size;
+	/* Release callback. If readahead holds resources underneath, this
 	 * function should be called to release it. */
 	void (*cra_release)(const struct lu_env *env, void *cbdata);
 	/* Callback data for cra_release routine */
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c
index 50ed4c3..2a92a17 100644
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -397,12 +397,9 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
 #endif
 	atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS);
 
-	/* This value may be reduced at connect time in
-	 * ptlrpc_connect_interpret() . We initialize it to only
-	 * 1MB until we know what the performance looks like.
-	 * In the future this should likely be increased. LU-1431 */
-	cli->cl_max_pages_per_rpc = min_t(int, PTLRPC_MAX_BRW_PAGES,
-					  LNET_MTU >> PAGE_CACHE_SHIFT);
+	/* Set it to possible maximum size. It may be reduced by ocd_brw_size
+	 * from OFD after connecting. */
+	cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES;
 
 	/* set cl_chunkbits default value to PAGE_CACHE_SHIFT,
 	 * it will be updated at OSC connection time. */
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h
index 1ae08db..26b3038 100644
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -334,12 +334,11 @@ static inline struct ll_inode_info *ll_i2info(struct inode *inode)
 	return container_of(inode, struct ll_inode_info, lli_vfs_inode);
 }
 
-/* default to about 40meg of readahead on a given system. That much tied
- * up in 512k readahead requests serviced at 40ms each is about 1GB/s. */
-#define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - PAGE_CACHE_SHIFT))
+/* default to about 64M of readahead on a given system. */
+#define SBI_DEFAULT_READAHEAD_MAX	(64UL << (20 - PAGE_CACHE_SHIFT))
 
 /* default to read-ahead full files smaller than 2MB on the second read */
-#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_CACHE_SHIFT))
+#define SBI_DEFAULT_READAHEAD_WHOLE_MAX	(2UL << (20 - PAGE_CACHE_SHIFT))
 
 enum ra_stat {
 	RA_STAT_HIT = 0,
@@ -371,17 +370,20 @@ struct ll_ra_info {
  * counted by page index.
  */
 struct ra_io_arg {
-	unsigned long ria_start;  /* start offset of read-ahead*/
-	unsigned long ria_end;    /* end offset of read-ahead*/
-	/* If stride read pattern is detected, ria_stoff means where
-	 * stride read is started. Note: for normal read-ahead, the
-	 * value here is meaningless, and also it will not be accessed*/
-	pgoff_t ria_stoff;
-	/* ria_length and ria_pages are the length and pages length in the
-	 * stride I/O mode. And they will also be used to check whether
-	 * it is stride I/O read-ahead in the read-ahead pages*/
-	unsigned long ria_length;
-	unsigned long ria_pages;
+	unsigned long ria_start;  /* start offset of read-ahead*/
+	unsigned long ria_end;    /* end offset of read-ahead*/
+	unsigned long ria_reserved; /* reserved pages for read-ahead */
+	unsigned long ria_end_min; /* minimum end to cover current read */
+	bool          ria_eof;    /* reach end of file */
+	/* If stride read pattern is detected, ria_stoff means where
+	 * stride read is started. Note: for normal read-ahead, the
+	 * value here is meaningless, and also it will not be accessed*/
+	pgoff_t ria_stoff;
+	/* ria_length and ria_pages are the length and pages length in the
+	 * stride I/O mode. And they will also be used to check whether
+	 * it is stride I/O read-ahead in the read-ahead pages*/
+	unsigned long ria_length;
+	unsigned long ria_pages;
 };
 
 /* LL_HIST_MAX=32 causes an overflow */
@@ -621,6 +623,11 @@ struct ll_readahead_state {
 	 * PTLRPC_MAX_BRW_PAGES chunks up to ->ra_max_pages.
 	 */
 	unsigned long ras_window_start, ras_window_len;
+	/*
+	 * Optimal RPC size. It decides how many pages will be sent
+	 * for each read-ahead.
+	 */
+	unsigned long ras_rpc_size;
 	/*
 	 * Where next read-ahead should start at. This lies within read-ahead
 	 * window. Read-ahead window is read in pieces rather than at once
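
The client_obd_setup() change above starts cl_max_pages_per_rpc at the
compile-time maximum and lets the server's ocd_brw_size cap it at connect
time; the readahead code then consumes that capped value as a page count.
A rough standalone sketch of the relationship (the 4 KiB page size and
16 MiB brw_size are illustrative values matching what test_101g below
configures, not the kernel definitions):

#include <stdio.h>

#define PAGE_SHIFT	12	/* 4 KiB pages, assumed for illustration */

int main(void)
{
	unsigned long brw_size = 16UL << 20;	/* ocd_brw_size from the OST */
	unsigned long max_pages_per_rpc = brw_size >> PAGE_SHIFT;

	/* 16 MiB / 4 KiB = 4096 pages per RPC, the value test_101g expects */
	printf("brw_size %lu bytes -> %lu pages per RPC\n",
	       brw_size, max_pages_per_rpc);
	return 0;
}
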
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c
index b3ce2f5..5297b42 100644
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -98,23 +98,6 @@ static unsigned long ll_ra_count_get(struct ll_sb_info *sbi,
 	if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages))
 		GOTO(out, ret = 0);
 
-	/* If the non-strided (ria_pages == 0) readahead window
-	 * (ria_start + ret) has grown across an RPC boundary, then trim
-	 * readahead size by the amount beyond the RPC so it ends on an
-	 * RPC boundary. If the readahead window is already ending on
-	 * an RPC boundary (beyond_rpc == 0), or smaller than a full
-	 * RPC (beyond_rpc < ret) the readahead size is unchanged.
-	 * The (beyond_rpc != 0) check is skipped since the conditional
-	 * branch is more expensive than subtracting zero from the result.
-	 *
-	 * Strided read is left unaligned to avoid small fragments beyond
-	 * the RPC boundary from needing an extra read RPC. */
-	if (ria->ria_pages == 0) {
-		long beyond_rpc = (ria->ria_start + ret) % PTLRPC_MAX_BRW_PAGES;
-		if (/* beyond_rpc != 0 && */ beyond_rpc < ret)
-			ret -= beyond_rpc;
-	}
-
 	if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) {
 		atomic_sub(ret, &ra->ra_cur_pages);
 		ret = 0;
@@ -148,15 +131,16 @@ void ll_ra_stats_inc(struct inode *inode, enum ra_stat which)
 }
 
 #define RAS_CDEBUG(ras) \
-	CDEBUG(D_READA,							      \
-	       "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu"     \
-	       "csr %lu sf %lu sp %lu sl %lu \n",			      \
-	       ras->ras_last_readpage, ras->ras_consecutive_requests,	      \
-	       ras->ras_consecutive_pages, ras->ras_window_start,	      \
-	       ras->ras_window_len, ras->ras_next_readahead,		      \
-	       ras->ras_requests, ras->ras_request_index,		      \
-	       ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
-	       ras->ras_stride_pages, ras->ras_stride_length)
+	CDEBUG(D_READA,							      \
+	       "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu rpc %lu "	      \
+	       "r %lu ri %lu csr %lu sf %lu sp %lu sl %lu\n",		      \
+	       ras->ras_last_readpage, ras->ras_consecutive_requests,	      \
+	       ras->ras_consecutive_pages, ras->ras_window_start,	      \
+	       ras->ras_window_len, ras->ras_next_readahead,		      \
+	       ras->ras_rpc_size,					      \
+	       ras->ras_requests, ras->ras_request_index,		      \
+	       ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \
+	       ras->ras_stride_pages, ras->ras_stride_length)
 
 static int index_in_window(unsigned long index, unsigned long point,
 			   unsigned long before, unsigned long after)
@@ -263,22 +247,11 @@ out:
 	ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\
 	ria->ria_pages)
 
-/* Limit this to the blocksize instead of PTLRPC_BRW_MAX_SIZE, since we don't
- * know what the actual RPC size is. If this needs to change, it makes more
- * sense to tune the i_blkbits value for the file based on the OSTs it is
- * striped over, rather than having a constant value for all files here. */
-
-/* RAS_INCREASE_STEP should be (1UL << (inode->i_blkbits - PAGE_CACHE_SHIFT)).
- * Temprarily set RAS_INCREASE_STEP to 1MB. After 4MB RPC is enabled
- * by default, this should be adjusted corresponding with max_read_ahead_mb
- * and max_read_ahead_per_file_mb otherwise the readahead budget can be used
- * up quickly which will affect read performance siginificantly. See LU-2816 */
-#define RAS_INCREASE_STEP(inode) (ONE_MB_BRW_SIZE >> PAGE_CACHE_SHIFT)
-
 static inline int stride_io_mode(struct ll_readahead_state *ras)
 {
 	return ras->ras_consecutive_stride_requests > 1;
 }
+
 /* The function calculates how much pages will be read in
  * [off, off + length], in such stride IO area,
  * stride_offset = st_off, stride_lengh = st_len,
@@ -344,6 +317,16 @@ static int ria_page_count(struct ra_io_arg *ria)
 			length);
 }
 
+static unsigned long ras_align(struct ll_readahead_state *ras,
+			       unsigned long index,
+			       unsigned long *remainder)
+{
+	unsigned long rem = index % ras->ras_rpc_size;
+	if (remainder != NULL)
+		*remainder = rem;
+	return index - rem;
+}
+
 /*Check whether the index is in the defined ra-window */
 static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
 {
@@ -356,15 +339,15 @@ static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
 		ria->ria_length < ria->ria_pages);
 }
 
-static int ll_read_ahead_pages(const struct lu_env *env,
-			       struct cl_io *io, struct cl_page_list *queue,
-			       struct ra_io_arg *ria,
-			       unsigned long *reserved_pages,
-			       pgoff_t *ra_end)
+static unsigned long
+ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page_list *queue, struct ll_readahead_state *ras,
+		    struct ra_io_arg *ria)
 {
 	struct cl_read_ahead ra = { 0 };
-	int rc, count = 0;
+	int rc = 0;
 	bool stride_ria;
+	unsigned long ra_end = 0;
 	pgoff_t page_idx;
 
 	LASSERT(ria != NULL);
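
ras_align() above is the single helper used throughout the rest of this
patch, both to round a page index down to the RPC grid and to report how far
past the boundary it sits (the remainder later bounds ria_end_min). A quick
standalone check of that behaviour, with a hypothetical harness and a
256-page (1 MiB) RPC size:

#include <assert.h>

/* Same arithmetic as the ras_align() added above, detached from
 * struct ll_readahead_state for illustration only. */
static unsigned long ra_align(unsigned long index, unsigned long rpc_size,
			      unsigned long *remainder)
{
	unsigned long rem = index % rpc_size;

	if (remainder != NULL)
		*remainder = rem;
	return index - rem;
}

int main(void)
{
	unsigned long rem;

	assert(ra_align(1000, 256, &rem) == 768 && rem == 232);
	assert(ra_align(768, 256, &rem) == 768 && rem == 0);
	return 0;
}
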
@@ -372,26 +355,47 @@ static int ll_read_ahead_pages(const struct lu_env *env,
 	stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0;
 
 	for (page_idx = ria->ria_start;
-	     page_idx <= ria->ria_end && *reserved_pages > 0; page_idx++) {
+	     page_idx <= ria->ria_end && ria->ria_reserved > 0; page_idx++) {
 		if (ras_inside_ra_window(page_idx, ria)) {
 			if (ra.cra_end == 0 || ra.cra_end < page_idx) {
+				unsigned long end;
+
 				cl_read_ahead_release(env, &ra);
 
 				rc = cl_io_read_ahead(env, io, page_idx, &ra);
 				if (rc < 0)
 					break;
 
+				CDEBUG(D_READA, "idx: %lu, ra: %lu, rpc: %lu\n",
+				       page_idx, ra.cra_end, ra.cra_rpc_size);
 				LASSERTF(ra.cra_end >= page_idx,
 					 "object: %p, indcies %lu / %lu\n",
 					 io->ci_obj, ra.cra_end, page_idx);
+				/* update read ahead RPC size.
+				 * NB: it's racy but doesn't matter */
+				if (ras->ras_rpc_size > ra.cra_rpc_size &&
+				    ra.cra_rpc_size > 0)
+					ras->ras_rpc_size = ra.cra_rpc_size;
+				/* trim it to align with optimal RPC size */
+				end = ras_align(ras, ria->ria_end + 1, NULL);
+				if (end > 0 && !ria->ria_eof)
+					ria->ria_end = end - 1;
+				if (ria->ria_end < ria->ria_end_min)
+					ria->ria_end = ria->ria_end_min;
+				if (ria->ria_end > ra.cra_end)
+					ria->ria_end = ra.cra_end;
 			}
+			if (page_idx > ria->ria_end)
+				break;
 
-			/* If the page is inside the read-ahead window*/
+			/* If the page is inside the read-ahead window */
 			rc = ll_read_ahead_page(env, io, queue, page_idx);
-			if (rc == 0) {
-				(*reserved_pages)--;
-				count++;
-			}
+			if (rc < 0)
+				break;
+
+			ra_end = page_idx;
+			if (rc == 0)
+				ria->ria_reserved--;
 		} else if (stride_ria) {
 			/* If it is not in the read-ahead window, and it is
 			 * read-ahead mode, then check whether it should skip
@@ -418,8 +422,7 @@ static int ll_read_ahead_pages(const struct lu_env *env,
 
 	cl_read_ahead_release(env, &ra);
 
-	*ra_end = page_idx;
-	return count;
+	return ra_end;
 }
 
 static int ll_readahead(const struct lu_env *env, struct cl_io *io,
@@ -429,7 +432,7 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	struct vvp_io *vio = vvp_env_io(env);
 	struct ll_thread_info *lti = ll_env_info(env);
 	struct cl_attr *attr = vvp_env_thread_attr(env);
-	unsigned long len, mlen = 0, reserved;
+	unsigned long len, mlen = 0;
 	pgoff_t ra_end, start = 0, end = 0;
 	struct inode *inode;
 	struct ra_io_arg *ria = &lti->lti_ria;
@@ -478,30 +481,17 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 		end = vio->vui_ra_start + vio->vui_ra_count - 1;
 
 	if (end != 0) {
-		unsigned long rpc_boundary;
-		/*
-		 * Align RA window to an optimal boundary.
-		 *
-		 * XXX This would be better to align to cl_max_pages_per_rpc
-		 * instead of PTLRPC_MAX_BRW_PAGES, because the RPC size may
-		 * be aligned to the RAID stripe size in the future and that
-		 * is more important than the RPC size.
-		 */
-		/* Note: we only trim the RPC, instead of extending the RPC
-		 * to the boundary, so to avoid reading too much pages during
-		 * random reading. */
-		rpc_boundary = ((end + 1) & (~(PTLRPC_MAX_BRW_PAGES - 1)));
-		if (rpc_boundary > 0)
-			rpc_boundary--;
-
-		if (rpc_boundary > start)
-			end = rpc_boundary;
-
-		/* Truncate RA window to end of file */
-		end = min(end, (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT));
-
-		ras->ras_next_readahead = max(end, end + 1);
-		RAS_CDEBUG(ras);
+		unsigned long end_index;
+
+		/* Truncate RA window to end of file */
+		end_index = (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT);
+		if (end_index <= end) {
+			end = end_index;
+			ria->ria_eof = true;
+		}
+
+		ras->ras_next_readahead = max(end, end + 1);
+		RAS_CDEBUG(ras);
 	}
 	ria->ria_start = start;
 	ria->ria_end = end;
@@ -533,28 +523,31 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	/* at least to extend the readahead window to cover current read */
 	if (!hit && vio->vui_ra_valid &&
 	    vio->vui_ra_start + vio->vui_ra_count > ria->ria_start) {
+		unsigned long remainder;
+
 		/* to the end of current read window. */
 		mlen = vio->vui_ra_start + vio->vui_ra_count - ria->ria_start;
 		/* trim to RPC boundary */
-		start = ria->ria_start & (PTLRPC_MAX_BRW_PAGES - 1);
-		mlen = min(mlen, PTLRPC_MAX_BRW_PAGES - start);
+		ras_align(ras, ria->ria_start, &remainder);
+		mlen = min(mlen, ras->ras_rpc_size - remainder);
+		ria->ria_end_min = ria->ria_start + mlen;
 	}
 
-	reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len, mlen);
-	if (reserved < len)
+	ria->ria_reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len, mlen);
+	if (ria->ria_reserved < len)
 		ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT);
 
 	CDEBUG(D_READA, "reserved pages: %lu/%lu/%lu, ra_cur %d, ra_max %lu\n",
-	       reserved, len, mlen,
+	       ria->ria_reserved, len, mlen,
 	       atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages),
 	       ll_i2sbi(inode)->ll_ra_info.ra_max_pages);
 
-	ret = ll_read_ahead_pages(env, io, queue, ria, &reserved, &ra_end);
+	ra_end = ll_read_ahead_pages(env, io, queue, ras, ria);
 
-	if (reserved != 0)
-		ll_ra_count_put(ll_i2sbi(inode), reserved);
+	if (ria->ria_reserved != 0)
+		ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved);
 
-	if (ra_end == end + 1 && ra_end == (kms >> PAGE_CACHE_SHIFT))
+	if (ra_end == end && ra_end == (kms >> PAGE_CACHE_SHIFT))
 		ll_ra_stats_inc(inode, RA_STAT_EOF);
 
 	/* if we didn't get to the end of the region we reserved from
@@ -565,13 +558,13 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	CDEBUG(D_READA, "ra_end = %lu end = %lu stride end = %lu pages = %d\n",
 	       ra_end, end, ria->ria_end, ret);
 
-	if (ra_end != end + 1) {
+	if (ra_end > 0 && ra_end != end) {
 		ll_ra_stats_inc(inode, RA_STAT_FAILED_REACH_END);
 		spin_lock(&ras->ras_lock);
-		if (ra_end < ras->ras_next_readahead &&
+		if (ra_end <= ras->ras_next_readahead &&
 		    index_in_window(ra_end, ras->ras_window_start, 0,
 				    ras->ras_window_len)) {
-			ras->ras_next_readahead = ra_end;
+			ras->ras_next_readahead = ra_end + 1;
 			RAS_CDEBUG(ras);
 		}
 		spin_unlock(&ras->ras_lock);
@@ -583,7 +576,7 @@ static int ll_readahead(const struct lu_env *env, struct cl_io *io,
 static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras,
 			  unsigned long index)
 {
-	ras->ras_window_start = index & (~(RAS_INCREASE_STEP(inode) - 1));
+	ras->ras_window_start = ras_align(ras, index, NULL);
 }
 
 /* called with the ras_lock held or from places where it doesn't matter */
@@ -612,6 +605,7 @@ static void ras_stride_reset(struct ll_readahead_state *ras)
 void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
 {
 	spin_lock_init(&ras->ras_lock);
+	ras->ras_rpc_size = PTLRPC_MAX_BRW_PAGES;
 	ras_reset(inode, ras, 0);
 	ras->ras_requests = 0;
 }
@@ -721,12 +715,15 @@ static void ras_increase_window(struct inode *inode,
 	 * but current clio architecture does not support retrieve such
	 * information from lower layer. FIXME later
 	 */
-	if (stride_io_mode(ras))
-		ras_stride_increase_window(ras, ra, RAS_INCREASE_STEP(inode));
-	else
-		ras->ras_window_len = min(ras->ras_window_len +
-					  RAS_INCREASE_STEP(inode),
-					  ra->ra_max_pages_per_file);
+	if (stride_io_mode(ras)) {
+		ras_stride_increase_window(ras, ra, ras->ras_rpc_size);
+	} else {
+		unsigned long wlen;
+
+		wlen = min(ras->ras_window_len + ras->ras_rpc_size,
+			   ra->ra_max_pages_per_file);
+		ras->ras_window_len = ras_align(ras, wlen, NULL);
+	}
 }
 
 static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
@@ -842,6 +839,8 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 			 * of read-ahead, so we use original offset here,
 			 * instead of ras_window_start, which is RPC aligned */
 			ras->ras_next_readahead = max(index, ras->ras_next_readahead);
+			ras->ras_window_start = max(ras->ras_stride_offset,
+						    ras->ras_window_start);
 		} else {
 			if (ras->ras_next_readahead < ras->ras_window_start)
 				ras->ras_next_readahead = ras->ras_window_start;
@@ -868,7 +867,7 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 		 */
 		ras->ras_next_readahead = max(index, ras->ras_next_readahead);
 		ras->ras_stride_offset = index;
-		ras->ras_window_len = RAS_INCREASE_STEP(inode);
+		ras->ras_window_start = max(index, ras->ras_window_start);
 	}
 
 	/* The initial ras_window_len is set to the request size. To avoid
@@ -1085,38 +1084,39 @@ static int ll_io_read_page(const struct lu_env *env, struct cl_io *io,
 	struct cl_2queue *queue = &io->ci_queue;
 	struct vvp_page *vpg;
 	int rc = 0;
+	bool uptodate;
 	ENTRY;
 
 	vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page));
+	uptodate = vpg->vpg_defer_uptodate;
+
 	if (sbi->ll_ra_info.ra_max_pages_per_file > 0 &&
 	    sbi->ll_ra_info.ra_max_pages > 0) {
 		struct vvp_io *vio = vvp_env_io(env);
 		enum ras_update_flags flags = 0;
 
-		if (vpg->vpg_defer_uptodate)
+		if (uptodate)
 			flags |= LL_RAS_HIT;
 		if (!vio->vui_ra_valid)
 			flags |= LL_RAS_MMAP;
 		ras_update(sbi, inode, ras, vvp_index(vpg), flags);
 	}
 
-	if (vpg->vpg_defer_uptodate) {
+	cl_2queue_init(queue);
+	if (uptodate) {
 		vpg->vpg_ra_used = 1;
 		cl_page_export(env, page, 1);
+		cl_page_disown(env, io, page);
+	} else {
+		cl_2queue_add(queue, page);
 	}
 
-	cl_2queue_init(queue);
-	/*
-	 * Add page into the queue even when it is marked uptodate above.
-	 * this will unlock it automatically as part of cl_page_list_disown().
-	 */
-	cl_2queue_add(queue, page);
 	if (sbi->ll_ra_info.ra_max_pages_per_file > 0 &&
 	    sbi->ll_ra_info.ra_max_pages > 0) {
 		int rc2;
 
 		rc2 = ll_readahead(env, io, &queue->c2_qin, ras,
-				   vpg->vpg_defer_uptodate);
+				   uptodate);
 		CDEBUG(D_READA, DFID "%d pages read ahead at %lu\n",
 		       PFID(ll_inode2fid(inode)), rc2, vvp_index(vpg));
 	}
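
The rw.c changes above consume an RPC size that the OSC layer advertises;
the osc_io.c hunk below is where cra_rpc_size is filled in from
cl_max_pages_per_rpc. On the llite side the cached ras_rpc_size is only ever
lowered toward the smallest value any object reports, without locking (as
the patch comment notes, a stale value is harmless). A detached sketch of
that "only shrink" update, with an illustrative harness:

#include <assert.h>

/* Sketch of the unlocked ras_rpc_size update in ll_read_ahead_pages():
 * every cl_io_read_ahead() result carries the OSC's pages-per-RPC and the
 * cached value is only ever reduced, never raised. */
static void update_rpc_size(unsigned long *ras_rpc_size,
			    unsigned long cra_rpc_size)
{
	if (cra_rpc_size > 0 && cra_rpc_size < *ras_rpc_size)
		*ras_rpc_size = cra_rpc_size;
}

int main(void)
{
	unsigned long rpc_size = 4096;		/* start at the maximum */

	update_rpc_size(&rpc_size, 1024);	/* an OST with 4 MiB RPCs */
	update_rpc_size(&rpc_size, 4096);	/* a larger value is ignored */
	assert(rpc_size == 1024);
	return 0;
}
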
diff --git a/lustre/osc/osc_io.c b/lustre/osc/osc_io.c
index 0a2a3dd..f031b65 100644
--- a/lustre/osc/osc_io.c
+++ b/lustre/osc/osc_io.c
@@ -103,6 +103,7 @@ static int osc_io_read_ahead(const struct lu_env *env,
 			ldlm_lock_decref(&lockh, dlmlock->l_req_mode);
 		}
 
+		ra->cra_rpc_size = osc_cli(osc)->cl_max_pages_per_rpc;
 		ra->cra_end = cl_index(osc2cl(osc),
 				       dlmlock->l_policy_data.l_extent.end);
 		ra->cra_release = osc_read_ahead_release;
@@ -142,7 +143,7 @@ static int osc_io_submit(const struct lu_env *env,
 
 	LASSERT(qin->pl_nr > 0);
 
-	CDEBUG(D_CACHE, "%d %d\n", qin->pl_nr, crt);
+	CDEBUG(D_CACHE|D_READA, "%d %d\n", qin->pl_nr, crt);
 
 	osc = cl2osc(ios->cis_obj);
 	cli = osc_cli(osc);
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index cfe75cc..f52a1cc 100644
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -6596,6 +6596,65 @@ test_101f() {
 }
 run_test 101f "check mmap read performance"
 
+test_101g() {
+	local rpcs
+	local osts=$(get_facets OST)
+	local list=$(comma_list $(osts_nodes))
+	local p="$TMP/$TESTSUITE-$TESTNAME.parameters"
+
+	save_lustre_params $osts "obdfilter.*.brw_size" > $p
+
+	$LFS setstripe -c 1 $DIR/$tfile
+
+	if [ $(lustre_version_code ost1) -ge $(version_code 2.8.52) ]; then
+		set_osd_param $list '' brw_size 16M
+
+		echo "remount client to enable large RPC size"
+		remount_client $MOUNT || error "remount_client failed"
+
+		for mp in $($LCTL get_param -n osc.*.max_pages_per_rpc); do
+			[ "$mp" -eq 4096 ] ||
+				error "max_pages_per_rpc not correctly set"
+		done
+
+		$LCTL set_param -n osc.*.rpc_stats=0
+
+		# 10*16 MiB should be enough for the test
+		dd if=/dev/zero of=$DIR/$tfile bs=16M count=10
+		cancel_lru_locks osc
+		dd of=/dev/null if=$DIR/$tfile bs=16M count=10
+
+		# calculate 16 MiB RPCs
+		rpcs=$($LCTL get_param 'osc.*.rpc_stats' |
+		       sed -n '/pages per rpc/,/^$/p' |
+		       awk 'BEGIN { sum = 0 }; /4096:/ { sum += $2 };
+			    END { print sum }')
+		echo $rpcs RPCs
+		[ "$rpcs" -eq 10 ] || error "not all RPCs are 16 MiB BRW rpcs"
+	fi
+
+	echo "set RPC size to 4MB"
+
+	$LCTL set_param -n osc.*.max_pages_per_rpc=4M osc.*.rpc_stats=0
+	dd if=/dev/zero of=$DIR/$tfile bs=4M count=25
+	cancel_lru_locks osc
+	dd of=/dev/null if=$DIR/$tfile bs=4M count=25
+
+	# calculate 4 MiB RPCs
+	rpcs=$($LCTL get_param 'osc.*.rpc_stats' |
+	       sed -n '/pages per rpc/,/^$/p' |
+	       awk 'BEGIN { sum = 0 }; /1024:/ { sum += $2 };
+		    END { print sum }')
+	echo $rpcs RPCs
+	[ "$rpcs" -eq 25 ] || error "not all RPCs are 4 MiB BRW rpcs"
+
+	restore_lustre_params < $p
+	remount_client $MOUNT || error "remount_client failed"
+
+	rm -f $p $DIR/$tfile
+}
+run_test 101g "Big bulk(4/16 MiB) readahead"
+
 setup_test102() {
 	test_mkdir -p $DIR/$tdir
 	chown $RUNAS_ID $DIR/$tdir
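
test_101g above asserts that whole-file sequential reads arrive as full-size
BRW RPCs once readahead is RPC-aligned: with 16 MiB RPCs (4096 pages) a
10 x 16 MiB read should complete in exactly 10 RPCs, and with 4 MiB RPCs
(1024 pages) a 25 x 4 MiB read in exactly 25. The expected rpc_stats buckets
can be derived as below (a hypothetical check, assuming a 4 KiB client page
size):

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;		/* assumed client page size */
	struct { unsigned long rpc_bytes, reads; } cases[] = {
		{ 16UL << 20, 10 },	/* brw_size=16M: dd bs=16M count=10 */
		{  4UL << 20, 25 },	/* rpc size 4M:  dd bs=4M  count=25 */
	};

	for (int i = 0; i < 2; i++) {
		/* each read turns into one full-size RPC, so the read count
		 * equals the expected RPC count in that bucket */
		printf("bucket %lu pages: expect %lu RPCs\n",
		       cases[i].rpc_bytes / page_size, cases[i].reads);
	}
	return 0;
}
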