From 0c16987b2233c32d775f0e3e6f6503c4b7825e02 Mon Sep 17 00:00:00 2001 From: Patrick Farrell Date: Wed, 29 May 2024 16:41:54 -0400 Subject: [PATCH] LU-17872 ldlm: switch to read_positive in reclaim_full Checking reclaim full for every lock request is expensive; it requires taking a global spinlock and can completely clog the MDS CPU on larger systems. If we switch to read_positive rather than sum_positive for our counter read, we avoid this spinlock at the cost of being off by as much as NR_CPU*32. Since the counter is for hundreds of thousands to millions of items and just triggers memory reclaim, this level of error is completely fine. This resolves the contention issue; on an OCI system with 384 cores, here is our mdtest comparison: Operation | Without Patch | With Patch | %Change ---------------------|---------------|-------------|------- Directory creation | 69481.994 | 64373.060 | -7% Directory stat | 87942.757 | 274670.454 | 212% Directory rename | 78127.922 | 92592.239 | 19% Directory removal | 69901.490 | 89560.415 | 28% File creation | 62789.774 | 107294.450 | 71% File stat | 88039.061 | 480469.711 | 446% File read | 82192.370 | 151117.380 | 84% File removal | 146690.828 | 127589.655 | -13% Tree creation | 46.549 | 56.992 | 22% Tree removal | 51.531 | 53.967 | 5% Note the *446%* improvement in stat and the 71% and 84% improvements in file creation and read. Note this issue is likely much worse on systems with higher core counts since the cost of summing the counter scales with the number of CPUs. This may be why this has not been seen before. 
Signed-off-by: Patrick Farrell Change-Id: I01a39abf5e6f0829156b413b1f44001e2c504be2 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/55141 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: wangdi Reviewed-by: Oleg Drokin --- lustre/ldlm/ldlm_lockd.c | 2 +- lustre/ldlm/ldlm_reclaim.c | 41 +++++++++++++++++++++++++++++++++-------- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 9215f6b..d0fc8dc 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1368,7 +1368,7 @@ int ldlm_handle_enqueue(struct ldlm_namespace *ns, } } else { if (ldlm_reclaim_full()) { - DEBUG_REQ(D_DLMTRACE, req, + DEBUG_REQ(D_WARNING | D_RPCTRACE, req, "Too many granted locks, reject current enqueue request and let the client retry later"); GOTO(out, rc = -EINPROGRESS); } diff --git a/lustre/ldlm/ldlm_reclaim.c b/lustre/ldlm/ldlm_reclaim.c index cc3750f..88eda6a 100644 --- a/lustre/ldlm/ldlm_reclaim.c +++ b/lustre/ldlm/ldlm_reclaim.c @@ -328,20 +328,45 @@ bool ldlm_reclaim_full(void) { __u64 high = ldlm_lock_limit; __u64 low = ldlm_reclaim_threshold; + bool exact_sum = false; + s64 lock_count; - if (low != 0 && CFS_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW)) + if (low != 0 && CFS_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_LOW)) { low = cfs_fail_val; + exact_sum = true; + } - if (low != 0 && - percpu_counter_sum_positive(&ldlm_granted_total) > low) - ldlm_reclaim_ns(); + if (low != 0) { + /* this takes a spinlock to get precise accuracy, so we only + * do it to get exact behavior for the sanity test + */ + if (exact_sum) + lock_count = + percpu_counter_sum_positive(&ldlm_granted_total); + + else + lock_count = + percpu_counter_read_positive(&ldlm_granted_total); + if (lock_count > low) + ldlm_reclaim_ns(); + } - if (high != 0 && CFS_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_HIGH)) + if (high != 0 && CFS_FAIL_CHECK(OBD_FAIL_LDLM_WATERMARK_HIGH)) { high = cfs_fail_val; + exact_sum = 
true; + } - if (high != 0 && - percpu_counter_sum_positive(&ldlm_granted_total) > high) - return true; + if (high != 0) { + if (exact_sum) + lock_count = + percpu_counter_sum_positive(&ldlm_granted_total); + + else + lock_count = + percpu_counter_read_positive(&ldlm_granted_total); + if (lock_count > high) + return true; + } return false; } -- 1.8.3.1