From 1427a72002e6b57017f1c66eb95f9bebff9ac37f Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sun, 15 Mar 2020 22:06:40 +0800 Subject: [PATCH] LU-12748 readahead: limit async ra requests Currently async readahead is limited by the following factors: 1) @ra_max_pages_per_file 2) @ra_max_read_ahead_whole_pages; 3) @ra_async_pages_per_file_threshold If an admin sets a large value such as 4G for @ra_max_read_ahead_whole_pages, with 16M RPCs we could have 256 async readahead requests in flight at the same time, which could consume all CPU resources for readahead without any limit. Even though we could set @max_active for the workqueue, RA requests would still be kept in the workqueue pool, which helps keep the CPUs from being busied; the problem is that RA will still try to use the CPUs later, and we might still submit too many requests to the workqueue. So instead of limiting it in the workqueue, we limit it earlier: if there are already too many async RA requests in the system (the default is 1/2 of the CPU cores), we just fall back to sync RA, which prevents read threads from using all CPU resources. 
Change-Id: I370c04e014f24c795c1a28effca9c51b1db2a417 Signed-off-by: Wang Shilong Reviewed-on: https://review.whamcloud.com/37927 Tested-by: jenkins Reviewed-by: Andreas Dilger Reviewed-by: James Simmons Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/llite/llite_internal.h | 8 ++++---- lustre/llite/llite_lib.c | 13 +++++++++++++ lustre/llite/lproc_llite.c | 13 +++++++------ lustre/llite/rw.c | 9 +++++++-- lustre/tests/sanity.sh | 5 ++--- 5 files changed, 33 insertions(+), 15 deletions(-) diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 5f71f2a..4fc2461 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -483,12 +483,12 @@ struct ll_ra_info { unsigned long ra_max_read_ahead_whole_pages; struct workqueue_struct *ll_readahead_wq; /* - * Max number of active works for readahead workqueue, - * default is 0 which make workqueue init number itself, - * unless there is a specific need for throttling the - * number of active work items, specifying '0' is recommended. + * Max number of active works that can be triggered + * for async readahead. */ unsigned int ra_async_max_active; + /* how many async readahead requests are in flight */ + atomic_t ra_async_inflight; /* Threshold to control when to trigger async readahead */ unsigned long ra_async_pages_per_file_threshold; }; diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index ce55b7f..28aaa2d 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -70,6 +70,17 @@ struct kmem_cache *ll_file_data_slab; #define log2(n) ffz(~(n)) #endif +/** + * If there is only one core visible to Lustre, + * async readahead will be disabled. To avoid massive + * oversubscription, we use 1/2 of the active cores as the + * default max number of async readahead requests. 
+ */ +static inline unsigned int ll_get_ra_async_max_active(void) +{ + return cfs_cpt_weight(cfs_cpt_tab, CFS_CPT_ANY) >> 1; +} + static struct ll_sb_info *ll_init_sbi(void) { struct ll_sb_info *sbi = NULL; @@ -118,6 +129,8 @@ static struct ll_sb_info *ll_init_sbi(void) sbi->ll_ra_info.ra_max_pages_per_file; sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file; sbi->ll_ra_info.ra_max_read_ahead_whole_pages = -1; + sbi->ll_ra_info.ra_async_max_active = ll_get_ra_async_max_active(); + atomic_set(&sbi->ll_ra_info.ra_async_inflight, 0); sbi->ll_flags |= LL_SBI_VERBOSE; #ifdef ENABLE_CHECKSUM diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index a82746b..d7d9f97 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -1112,18 +1112,19 @@ static ssize_t max_read_ahead_async_active_store(struct kobject *kobj, if (rc) return rc; - if (val < 1 || val > WQ_UNBOUND_MAX_ACTIVE) { - CERROR("%s: cannot set max_read_ahead_async_active=%u %s than %u\n", - sbi->ll_fsname, val, - val < 1 ? "smaller" : "larger", - val < 1 ? 1 : WQ_UNBOUND_MAX_ACTIVE); + /** + * It doesn't make any sense to make it exceed what + * the workqueue could actually support. 
+ */ + if (val > WQ_UNBOUND_MAX_ACTIVE) { + CERROR("%s: cannot set max_read_ahead_async_active=%u larger than %u\n", + sbi->ll_fsname, val, WQ_UNBOUND_MAX_ACTIVE); return -ERANGE; } spin_lock(&sbi->ll_lock); sbi->ll_ra_info.ra_async_max_active = val; spin_unlock(&sbi->ll_lock); - workqueue_set_max_active(sbi->ll_ra_info.ll_readahead_wq, val); return count; } diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 89ea7fb..6161bdc 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -528,6 +528,7 @@ static void ll_readahead_handle_work(struct work_struct *wq) __u64 kms; int rc; pgoff_t eof_index; + struct ll_sb_info *sbi; work = container_of(wq, struct ll_readahead_work, lrw_readahead_work); @@ -535,6 +536,7 @@ static void ll_readahead_handle_work(struct work_struct *wq) ras = &fd->fd_ras; file = work->lrw_file; inode = file_inode(file); + sbi = ll_i2sbi(inode); env = cl_env_alloc(&refcheck, LCT_NOREF); if (IS_ERR(env)) @@ -567,7 +569,7 @@ static void ll_readahead_handle_work(struct work_struct *wq) ria->ria_end_idx = work->lrw_end_idx; pages = ria->ria_end_idx - ria->ria_start_idx + 1; - ria->ria_reserved = ll_ra_count_get(ll_i2sbi(inode), ria, + ria->ria_reserved = ll_ra_count_get(sbi, ria, ria_page_count(ria), pages_min); CDEBUG(D_READA, @@ -631,6 +633,7 @@ out_put_env: out_free_work: if (ra_end_idx > 0) ll_ra_stats_inc_sbi(ll_i2sbi(inode), RA_STAT_ASYNC); + atomic_dec(&sbi->ll_ra_info.ra_async_inflight); ll_readahead_work_free(work); } @@ -1479,7 +1482,8 @@ static int kickoff_async_readahead(struct file *file, unsigned long pages) * we do async readahead, allowing the user thread to do fast i/o. 
*/ if (stride_io_mode(ras) || !throttle || - ras->ras_window_pages < throttle) + ras->ras_window_pages < throttle || + atomic_read(&ra->ra_async_inflight) > ra->ra_async_max_active) return 0; if ((atomic_read(&ra->ra_cur_pages) + pages) > ra->ra_max_pages) @@ -1491,6 +1495,7 @@ static int kickoff_async_readahead(struct file *file, unsigned long pages) /* ll_readahead_work_free() free it */ OBD_ALLOC_PTR(lrw); if (lrw) { + atomic_inc(&sbi->ll_ra_info.ra_async_inflight); lrw->lrw_file = get_file(file); lrw->lrw_start_idx = start_idx; lrw->lrw_end_idx = end_idx; diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 3f5767f..3c603e2 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -20555,9 +20555,8 @@ test_318() { llite.*.max_read_ahead_async_active 2>/dev/null) [ $max_active -ne 256 ] && error "expected 256 but got $max_active" - # currently reset to 0 is unsupported, leave it 512 for now. - $LCTL set_param llite.*.max_read_ahead_async_active=0 && - error "set max_read_ahead_async_active should fail" + $LCTL set_param llite.*.max_read_ahead_async_active=0 || + error "set max_read_ahead_async_active should succeed" $LCTL set_param llite.*.max_read_ahead_async_active=512 max_active=$($LCTL get_param -n \ -- 1.8.3.1