From 2d59294d52b696125acc464e5910c893d9aef237 Mon Sep 17 00:00:00 2001
From: Patrick Farrell <pfarrell@whamcloud.com>
Date: Wed, 4 May 2022 20:50:57 -0400
Subject: [PATCH] LU-15821 ldlm: Prioritize blocking callbacks

The current code places bl_ast lock callbacks at the end of
the global BL callback queue.  This is bad because it
causes urgent requests from the server to wait behind
non-urgent cleanup tasks that keep lru_size at the right
level.

This can lead to evictions if the global queue holds a
large number of items, so the callback is not serviced
in a timely manner.

Put bl_ast callbacks on the priority queue so they do not
wait behind the background traffic.

Add some additional debug messages in this area.

Signed-off-by: Patrick Farrell <pfarrell@whamcloud.com>
Change-Id: Ic6eb65819a4a93e9d30e807d386ca18380b30c7d
Reviewed-on: https://review.whamcloud.com/47215
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
---
 lustre/ldlm/ldlm_lockd.c | 60 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 48 insertions(+), 12 deletions(-)

diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c
index f82df7d..7e59709 100644
--- a/lustre/ldlm/ldlm_lockd.c
+++ b/lustre/ldlm/ldlm_lockd.c
@@ -83,27 +83,29 @@ static inline timeout_t ldlm_get_rq_timeout(void)
 }
 
 struct ldlm_bl_pool {
-	spinlock_t blp_lock;
+	spinlock_t		blp_lock;
 
 	/*
 	 * blp_prio_list is used for callbacks that should be handled
 	 * as a priority. It is used for LDLM_FL_DISCARD_DATA requests.
 	 * see b=13843
 	 */
-	struct list_head blp_prio_list;
+	struct list_head	blp_prio_list;
 
 	/*
 	 * blp_list is used for all other callbacks which are likely
 	 * to take longer to process.
 	 */
-	struct list_head blp_list;
-
-	wait_queue_head_t blp_waitq;
-	struct completion blp_comp;
-	atomic_t blp_num_threads;
-	atomic_t blp_busy_threads;
-	int blp_min_threads;
-	int blp_max_threads;
+	struct list_head	blp_list;
+
+	wait_queue_head_t	blp_waitq;
+	struct completion	blp_comp;
+	atomic_t		blp_num_threads;
+	atomic_t		blp_busy_threads;
+	int			blp_min_threads;
+	int			blp_max_threads;
+	int			blp_total_locks;
+	int			blp_total_blwis;
 };
 
 struct ldlm_bl_work_item {
@@ -2116,22 +2118,41 @@ static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi,
 			       enum ldlm_cancel_flags cancel_flags)
 {
 	struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool;
+	char *prio = "regular";
+	int count;
 
 	ENTRY;
 
 	spin_lock(&blp->blp_lock);
+	/* cannot access blwi after added to list and lock is dropped */
+	count = blwi->blwi_lock ? 1 : blwi->blwi_count;
+
+	/* if the server is waiting on a lock to be cancelled (bl_ast), this is
+	 * an urgent request and should go in the priority queue so it doesn't
+	 * get stuck behind non-priority work (eg, lru size management)
+	 *
+	 * We also prioritize discard_data, which is for eviction handling
+	 */
 	if (blwi->blwi_lock &&
-	    ldlm_is_discard_data(blwi->blwi_lock)) {
-		/* add LDLM_FL_DISCARD_DATA requests to the priority list */
+	    (ldlm_is_discard_data(blwi->blwi_lock) ||
+	     ldlm_is_bl_ast(blwi->blwi_lock))) {
 		list_add_tail(&blwi->blwi_entry, &blp->blp_prio_list);
+		prio = "priority";
 	} else {
 		/* other blocking callbacks are added to the regular list */
 		list_add_tail(&blwi->blwi_entry, &blp->blp_list);
 	}
+	blp->blp_total_locks += count;
+	blp->blp_total_blwis++;
 	spin_unlock(&blp->blp_lock);
 
 	wake_up(&blp->blp_waitq);
 
+	/* unlocked read of blp values is intentional - OK for debug */
+	CDEBUG(D_DLMTRACE,
+	       "added %d/%d locks to %s blp list, %d blwis in pool\n",
+	       count, blp->blp_total_locks, prio, blp->blp_total_blwis);
+
 	/*
 	 * can not check blwi->blwi_flags as blwi could be already freed in
 	 * LCF_ASYNC mode
@@ -2749,10 +2770,23 @@ static int ldlm_bl_get_work(struct ldlm_bl_pool *blp,
 		if (++num_bl >= num_th)
 			num_bl = 0;
 		list_del(&blwi->blwi_entry);
+		blp->blp_total_locks -= blwi->blwi_lock ? 1 : blwi->blwi_count;
+		blp->blp_total_blwis--;
 	}
 	spin_unlock(&blp->blp_lock);
 	*p_blwi = blwi;
 
+	/* intentional unlocked read of blp values - OK for debug */
+	if (blwi) {
+		CDEBUG(D_DLMTRACE,
+		       "Got %d locks of %d total in blp.  (%d blwis in pool)\n",
+		       blwi->blwi_lock ? 1 : blwi->blwi_count,
+		       blp->blp_total_locks, blp->blp_total_blwis);
+	} else {
+		CDEBUG(D_DLMTRACE,
+		       "No blwi found in queue (no bl locks in queue)\n");
+	}
+
 	if (*p_exp != NULL && *p_blwi != NULL) {
 		obd_stale_export_put(*p_exp);
 		*p_exp = NULL;
@@ -3293,6 +3327,8 @@ static int ldlm_setup(void)
 	init_waitqueue_head(&blp->blp_waitq);
 	atomic_set(&blp->blp_num_threads, 0);
 	atomic_set(&blp->blp_busy_threads, 0);
+	blp->blp_total_locks = 0;
+	blp->blp_total_blwis = 0;
 
 	if (ldlm_num_threads == 0) {
 		blp->blp_min_threads = LDLM_NTHRS_INIT;
-- 
1.8.3.1