From 21ccfd99de46e1ca1572b16db0d3cef9cd805f4f Mon Sep 17 00:00:00 2001
From: James Simmons
Date: Tue, 24 Mar 2020 22:47:00 -0400
Subject: [PATCH] LU-13258 llite: bind readahead workqueue to CPT set

A workqueue is used by Lustre to optimize readahead. This workqueue
can run on any core and can easily be oversubscribed, which has a
negative impact on HPC applications running on a Lustre client. Limit
the number of threads the workqueue can run to the number of CPUs
allocated to Lustre, and only allow those threads to run on the cores
belonging to the CPT set.

Change-Id: Ifcc662d52843f5028c34d55695c1d6297e5c00b0
Signed-off-by: James Simmons
Reviewed-on: https://review.whamcloud.com/37717
Reviewed-by: Wang Shilong
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Shaun Tancheff
Reviewed-by: Stephen Champion
Reviewed-by: Oleg Drokin
---
 libcfs/autoconf/lustre-libcfs.m4        | 17 +++++++++++++++++
 libcfs/include/libcfs/linux/linux-cpu.h |  7 +++++++
 lustre/llite/llite_lib.c                | 19 +++++++++++++++++--
 lustre/llite/lproc_llite.c              |  4 +++-
 4 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/libcfs/autoconf/lustre-libcfs.m4 b/libcfs/autoconf/lustre-libcfs.m4
index 0c384d41..a46d3f8 100644
--- a/libcfs/autoconf/lustre-libcfs.m4
+++ b/libcfs/autoconf/lustre-libcfs.m4
@@ -786,6 +786,22 @@ rhashtable_lookup_get_insert_fast, [
 ]) # LIBCFS_RHASHTABLE_LOOKUP_GET_INSERT_FAST
 
 #
+# Kernel version 4.12-rc2 8f553c498e1772cccb39a114da4a498d22992758
+# provide proper CPU hotplug locking
+#
+AC_DEFUN([LIBCFS_CPUS_READ_LOCK], [
+LB_CHECK_COMPILE([if 'cpus_read_[un]lock' exist],
+cpu_read_lock, [
+	#include <linux/cpu.h>
+],[
+	cpus_read_lock();
+	cpus_read_unlock();
+],[
+	AC_DEFINE(HAVE_CPUS_READ_LOCK, 1, ['cpu_read_lock' exist])
+])
+]) # LIBCFS_CPUS_READ_LOCK
+
+#
 # Kernel version 4.12-rc3 f9727a17db9bab71ddae91f74f11a8a2f9a0ece6
 # renamed uuid_be to uuid_t
 #
@@ -1219,6 +1235,7 @@ LIBCFS_RHT_BUCKET_VAR
 LIBCFS_HAVE_PROCESSOR_HEADER
 LIBCFS_HAVE_WAIT_BIT_HEADER
 LIBCFS_WAIT_QUEUE_TASK_LIST_RENAME
+LIBCFS_CPUS_READ_LOCK
 LIBCFS_UUID_T
 # 4.13
 LIBCFS_WAIT_QUEUE_ENTRY
diff --git a/libcfs/include/libcfs/linux/linux-cpu.h b/libcfs/include/libcfs/linux/linux-cpu.h
index ab6b55e..8353f69 100644
--- a/libcfs/include/libcfs/linux/linux-cpu.h
+++ b/libcfs/include/libcfs/linux/linux-cpu.h
@@ -39,8 +39,15 @@
 #ifndef __LIBCFS_LINUX_CPU_H__
 #define __LIBCFS_LINUX_CPU_H__
 
+#include <linux/cpu.h>
+
 #ifndef HAVE_TOPOLOGY_SIBLING_CPUMASK
 # define topology_sibling_cpumask(cpu)	topology_thread_cpumask(cpu)
 #endif /* HAVE_TOPOLOGY_SIBLING_CPUMASK */
 
+#ifndef HAVE_CPUS_READ_LOCK
+# define cpus_read_lock		get_online_cpus
+# define cpus_read_unlock	put_online_cpus
+#endif
+
 #endif /* __LIBCFS_LINUX_CPU_H__ */
diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c
index 9c4983b..5a56084 100644
--- a/lustre/llite/llite_lib.c
+++ b/lustre/llite/llite_lib.c
@@ -36,6 +36,7 @@
 
 #define DEBUG_SUBSYSTEM S_LLITE
 
+#include <linux/cpu.h>
 #include
 #include
 #include
@@ -49,6 +50,9 @@
 #include
 #include
 
+#ifndef HAVE_CPUS_READ_LOCK
+#include <libcfs/linux/linux-cpu.h>
+#endif
 #include
 #ifdef HAVE_UAPI_LINUX_MOUNT_H
 #include <uapi/linux/mount.h>
@@ -83,6 +87,8 @@ static inline unsigned int ll_get_ra_async_max_active(void)
 
 static struct ll_sb_info *ll_init_sbi(void)
 {
+	struct workqueue_attrs attrs = { };
+	cpumask_var_t *mask;
 	struct ll_sb_info *sbi = NULL;
 	unsigned long pages;
 	unsigned long lru_page_max;
@@ -111,13 +117,23 @@ static struct ll_sb_info *ll_init_sbi(void)
 	pages = si.totalram - si.totalhigh;
 	lru_page_max = pages / 2;
 
-	sbi->ll_ra_info.ra_async_max_active = 0;
+	sbi->ll_ra_info.ra_async_max_active = ll_get_ra_async_max_active();
 	sbi->ll_ra_info.ll_readahead_wq =
 		alloc_workqueue("ll-readahead-wq", WQ_UNBOUND,
 				sbi->ll_ra_info.ra_async_max_active);
 	if (!sbi->ll_ra_info.ll_readahead_wq)
 		GOTO(out_pcc, rc = -ENOMEM);
 
+	mask = cfs_cpt_cpumask(cfs_cpt_tab, CFS_CPT_ANY);
+	if (mask && alloc_cpumask_var(&attrs.cpumask, GFP_KERNEL)) {
+		cpumask_copy(attrs.cpumask, *mask);
+		cpus_read_lock();
+		cfs_apply_workqueue_attrs(sbi->ll_ra_info.ll_readahead_wq,
+					  &attrs);
+		cpus_read_unlock();
+		free_cpumask_var(attrs.cpumask);
+	}
+
 	/* initialize ll_cache data */
 	sbi->ll_cache = cl_cache_init(lru_page_max);
 	if (sbi->ll_cache == NULL)
@@ -129,7 +145,6 @@ static struct ll_sb_info *ll_init_sbi(void)
 		 sbi->ll_ra_info.ra_max_pages_per_file;
 	sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file;
 	sbi->ll_ra_info.ra_max_read_ahead_whole_pages = -1;
-	sbi->ll_ra_info.ra_async_max_active = ll_get_ra_async_max_active();
 	atomic_set(&sbi->ll_ra_info.ra_async_inflight, 0);
 
 	sbi->ll_flags |= LL_SBI_VERBOSE;
diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c
index d7d9f97..3e5310d 100644
--- a/lustre/llite/lproc_llite.c
+++ b/lustre/llite/lproc_llite.c
@@ -1114,7 +1114,9 @@ static ssize_t max_read_ahead_async_active_store(struct kobject *kobj,
 
 	/**
 	 * It doesn't make any sense to make it exceed what
-	 * workqueue could acutally support.
+	 * workqueue could actually support. This can easily
+	 * oversubscribe the cores but Lustre internally
+	 * throttles to avoid those impacts.
 	 */
 	if (val > WQ_UNBOUND_MAX_ACTIVE) {
 		CERROR("%s: cannot set max_read_ahead_async_active=%u larger than %u\n",
-- 
1.8.3.1
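
Note on the mechanism used above: the patch relies on the kernel's
unbound-workqueue attribute interface, namely copying the desired CPU
mask into a struct workqueue_attrs and applying it while holding the
CPU hotplug read lock. The sketch below shows that pattern outside of
Lustre and is not part of the patch: the function and workqueue names
are invented, and it calls apply_workqueue_attrs() directly, whose
visibility to modules varies by kernel version, whereas the patch goes
through the libcfs wrapper cfs_apply_workqueue_attrs() and gets the
mask from the CPT table via cfs_cpt_cpumask(cfs_cpt_tab, CFS_CPT_ANY).

/*
 * Illustrative sketch only: bind an unbound workqueue to a
 * caller-supplied CPU mask.  Assumes a >= 4.12 kernel where
 * cpus_read_lock()/cpus_read_unlock() exist and
 * apply_workqueue_attrs() is callable from this context.
 */
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/printk.h>
#include <linux/workqueue.h>

static struct workqueue_struct *
example_alloc_bound_wq(const struct cpumask *mask, int max_active)
{
	struct workqueue_attrs attrs = { };	/* nice = 0, default flags */
	struct workqueue_struct *wq;

	/* max_active caps concurrent work items, like ra_async_max_active */
	wq = alloc_workqueue("example-bound-wq", WQ_UNBOUND, max_active);
	if (!wq)
		return NULL;

	/* attrs.cpumask is a cpumask_var_t and must be allocated first */
	if (mask && alloc_cpumask_var(&attrs.cpumask, GFP_KERNEL)) {
		cpumask_copy(attrs.cpumask, mask);

		/* take the hotplug read lock across the attribute update,
		 * mirroring what ll_init_sbi() does in this patch */
		cpus_read_lock();
		if (apply_workqueue_attrs(wq, &attrs))
			pr_warn("example: failed to bind workqueue to mask\n");
		cpus_read_unlock();

		free_cpumask_var(attrs.cpumask);
	}

	return wq;
}

In ll_init_sbi() the equivalent mask is the CPT-wide cpumask returned
by cfs_cpt_cpumask(), so the readahead worker threads stay on the CPUs
that were configured for Lustre.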