From 0c16987b2233c32d775f0e3e6f6503c4b7825e02 Mon Sep 17 00:00:00 2001 From: Patrick Farrell Date: Wed, 29 May 2024 16:41:54 -0400 Subject: [PATCH] LU-17872 ldlm: switch to read_positive in reclaim_full Checking reclaim full for every lock request is expensive; it requires taking a global spinlock and can completely clog the MDS CPU on larger systems. If we switch to read_positive rather than sum_positive for our counter read, we avoid this spinlock at the cost of being off by as much as NR_CPU*32. Since the counter is for hundreds of thousands to millions of items and just triggers memory reclaim, this level of error is completely fine. This resolves the contention issue; on an OCI system with 384 cores, here is our mdtest comparison: Operation | Without Patch | With Patch | %Change ---------------------|---------------|-------------|------- Directory creation | 69481.994 | 64373.060 | -7% Directory stat | 87942.757 | 274670.454 | 212% Directory rename | 78127.922 | 92592.239 | 19% Directory removal | 69901.490 | 89560.415 | 28% File creation | 62789.774 | 107294.450 | 71% File stat | 88039.061 | 480469.711 | 446% File read | 82192.370 | 151117.380 | 84% File removal | 146690.828 | 127589.655 | -13% Tree creation | 46.549 | 56.992 | 22% Tree removal | 51.531 | 53.967 | 5% Note the *446%* improvement in stat and the 71% and 84% improvements in file creation and read. Note this issue is likely much worse on systems with higher core counts since the cost of summing the counter scales with the number of CPUs. This may be why this has not been seen before. 
Signed-off-by: Patrick Farrell Change-Id: I01a39abf5e6f0829156b413b1f44001e2c504be2 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/55141 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: wangdi Reviewed-by: Oleg Drokin --- lustre/ldlm/ldlm_lockd.c | 2 +- lustre/ldlm/ldlm_reclaim.c | 41 +++++++++++++++++++++++++++++++++-------- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 9215f6b..d0fc8dc 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1368,7 +1368,7 @@ int ldlm_handle_enqueue(struct ldlm_namespace *ns, } } else { if (ldlm_reclaim_full()) { - DEBUG_REQ(D_DLMTRACE, req, + DEBUG_REQ(D_WARNING | D_RPCTRACE, req, "Too many granted locks, reject current enqueue request and let the client retry later"); GOTO(out, rc = -EINPROGRESS); } diff --git a/lustre/ldlm/ldlm_reclaim.c b/lustre/ldlm/ldlm_reclaim.c index cc3750f..88eda6a 100644 --- a/lustre/ldlm/ldlm_reclaim.c +++ b/lustre/ldlm/ldlm_reclaim.c @@ -328,20 +328,45 @@ bool ldlm_reclaim_full(void) { __u64 high = ldlm_lock_limit; __u64 low = ldlm_reclaim_threshold; + bool exact_sum = false; + s64 lock_count; - if (low != 0 && CFS_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW)) + if (low != 0 && CFS_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW)) { low = cfs_fail_val; + exact_sum = true; + } - if (low != 0 && - percpu_counter_sum_positive(&ldlm_granted_total) > low) - ldlm_reclaim_ns(); + if (low != 0) { + /* this takes a spinlock to get precise accuracy, so we only + * do it to get exact behavior for the sanity test + */ + if (exact_sum) + lock_count = + percpu_counter_sum_positive(&ldlm_granted_total); + + else + lock_count = + percpu_counter_read_positive(&ldlm_granted_total); + if (lock_count > low) + ldlm_reclaim_ns(); + } - if (high != 0 && CFS_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_HIGH)) + if (high != 0 && CFS_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_HIGH)) { high = cfs_fail_val; + exact_sum = 
true; + } - if (high != 0 && - percpu_counter_sum_positive(&ldlm_granted_total) > high) - return true; + if (high != 0) { + if (exact_sum) + lock_count = + percpu_counter_sum_positive(&ldlm_granted_total); + + else + lock_count = + percpu_counter_read_positive(&ldlm_granted_total); + if (lock_count > high) + return true; + } return false; } -- 1.8.3.1