From cfd5411db998c2b0427e310a19b8741b1ec3644e Mon Sep 17 00:00:00 2001 From: Oleg Drokin Date: Fri, 19 Jan 2024 00:24:43 -0500 Subject: [PATCH] LU-17446 ldlm: Do not wait for BL AST RPC completion on cancel If we have sent an AST RPC to the client and while it's in flight the client sent in the cancel, sometimes (esp. if AST or reply to it are lost) even though the lock is already cancelled, whoever is waiting on it is still stuck while trying to resend ASTs. And in the end the client is not even evicted because the lock cancel did come and all is fine, but it can add over a hundred seconds to lock granting process in some non-ideal circumstances. For simplicity we only treat Blocking ASTs like this, since we can only have a single one of this kind. This is adding additional pointer to struct ldlm_lock, but that is already 560 bytes so does not really mean much. Lustre-change: https://review.whamcloud.com/53739 Lustre-commit: TBD (from d4b782c249377276dc9f6ddbf0fab34956d57af6) Signed-off-by: Oleg Drokin Change-Id: Id2231bc3bfc3e094faae2872fe09f3c330d441df Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/53840 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger --- lustre/include/lustre_dlm.h | 7 ++++ lustre/include/lustre_net.h | 4 +-- lustre/ldlm/ldlm_lock.c | 18 +++++----- lustre/ldlm/ldlm_lockd.c | 86 ++++++++++++++++++++++++++++++++++++++++++++- lustre/ptlrpc/client.c | 14 ++++++++ lustre/ptlrpc/niobuf.c | 8 +++-- 6 files changed, 123 insertions(+), 14 deletions(-) diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index 16034df..9d68700 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -1012,6 +1012,13 @@ struct ldlm_lock { * is: res lock -> exp_bl_list_lock -> wanting_lists_spinlock. 
*/ struct list_head l_exp_list; + + /** + * Used internally to trace in-flight blocking ASTs of this lock + * There's no locking, so to pick items from here have to use atomic + * exchange primitives + */ + struct ptlrpc_request *l_sent_bl_ast; }; enum ldlm_match_flags { diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index e71b936..2157fe3 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -725,8 +725,7 @@ typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env, struct ptlrpc_request *req, void *arg, int rc); /** Type of request resend call-back */ -typedef void (*ptlrpc_resend_cb_t)(struct ptlrpc_request *req, - void *arg); +typedef int (*ptlrpc_resend_cb_t)(struct ptlrpc_request *req, void *arg); /** * Definition of request pool structure. @@ -2058,6 +2057,7 @@ void ptlrpc_restart_req(struct ptlrpc_request *req); void ptlrpc_abort_inflight(struct obd_import *imp); void ptlrpc_cleanup_imp(struct obd_import *imp); void ptlrpc_abort_set(struct ptlrpc_request_set *set); +void ptlrpc_req_abort(struct ptlrpc_request *req); struct ptlrpc_request_set *ptlrpc_prep_set(void); struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index 06626ae..aaeb892 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -2808,7 +2808,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, if (resource == NULL) { libcfs_debug_msg(msgdata, - "%pV ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", + "%pV ns: \?\? lock: %pK/%#llx lrc: %d/%d,%d mode: %s/%s res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d/%pK\n", &vaf, lock, lock->l_handle.h_cookie, @@ -2820,7 +2820,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, lock->l_remote_handle.cookie, exp ? 
refcount_read(&exp->exp_handle.h_ref) : -99, lock->l_pid, lock->l_callback_timestamp, - lock->l_lvb_type); + lock->l_lvb_type, lock->l_sent_bl_ast); va_end(args); return; } @@ -2828,7 +2828,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, switch (resource->lr_type) { case LDLM_EXTENT: libcfs_debug_msg(msgdata, - "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s [%llu->%llu] (req %llu->%llu) gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", + "%pV ns: %s lock: %pK/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s [%llu->%llu] (req %llu->%llu) gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d/%pK\n", &vaf, ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, @@ -2847,12 +2847,12 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, lock->l_remote_handle.cookie, exp ? refcount_read(&exp->exp_handle.h_ref) : -99, lock->l_pid, lock->l_callback_timestamp, - lock->l_lvb_type); + lock->l_lvb_type, lock->l_sent_bl_ast); break; case LDLM_FLOCK: libcfs_debug_msg(msgdata, - "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s pid: %d [%llu->%llu] flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld\n", + "%pV ns: %s lock: %pK/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s pid: %d [%llu->%llu] flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld\n", &vaf, ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, @@ -2874,7 +2874,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, case LDLM_IBITS: libcfs_debug_msg(msgdata, - "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx/%#llx rrc: %d type: %s gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", + "%pV ns: %s lock: %pK/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx/%#llx rrc: %d type: %s gid %llu flags: %#llx nid: %s 
remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d/%pK\n", &vaf, ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, @@ -2892,12 +2892,12 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, lock->l_remote_handle.cookie, exp ? refcount_read(&exp->exp_handle.h_ref) : -99, lock->l_pid, lock->l_callback_timestamp, - lock->l_lvb_type); + lock->l_lvb_type, lock->l_sent_bl_ast); break; default: libcfs_debug_msg(msgdata, - "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n", + "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d/%pK\n", &vaf, ldlm_lock_to_ns_name(lock), lock, lock->l_handle.h_cookie, @@ -2912,7 +2912,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, lock->l_remote_handle.cookie, exp ? refcount_read(&exp->exp_handle.h_ref) : -99, lock->l_pid, lock->l_callback_timestamp, - lock->l_lvb_type); + lock->l_lvb_type, lock->l_sent_bl_ast); break; } va_end(args); diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 18d7eec..15ba39a 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -731,6 +731,10 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock, { struct lnet_process_id peer = req->rq_import->imp_connection->c_peer; + /* This AST was cancelled by lock cancel/convert */ + if (rc == -ECANCELED) + return rc; + if (!req->rq_replied || (rc && rc != -EINVAL)) { if (ldlm_is_cancel(lock)) { LDLM_DEBUG(lock, @@ -833,6 +837,36 @@ static int ldlm_cb_interpret(const struct lu_env *env, } break; case LDLM_BL_CALLBACK: + /* if nobody grabbed our linked request, decref to match + * acquisition. 
+ * Three cases are possible here: + * - l_sent_bl_ast is equal to our request - we'll free it + * - it's equal to NULL - so cancel / convert thread grabbed it + * - it's not NULL, but not equal to req - this is a convert + * race, where lock convert came, and by the time we got here, + * the lock got another BL AST that got recorded here. + * + * This 3rd case is the most annoying handling wise. Once we + * acquired this request with atomic exchange there's no safe + * way to put it back without risking the other thread missing + * it so we have to release it here anyway and bring the + * original problem back - if the other AST won't be replied + * in time, the other blocked thread would wait for a long time. + * + * To minimize this window, we'll do an unsafe check first, + * it's ok to check sent_bl_ast for both NULL and non-req value + * because once those values were seen, they cannot magically + * return to the req value. Either way whatever we got we must + * free if not NULL. + */ + if (lock->l_sent_bl_ast != NULL && lock->l_sent_bl_ast == req) { + struct ptlrpc_request *saved_req; + + saved_req = xchg_acquire(&lock->l_sent_bl_ast, NULL); + if (saved_req) + ptlrpc_req_finished(saved_req); + } + if (rc != 0) rc = ldlm_handle_ast_error(lock, req, rc, "blocking"); break; @@ -855,12 +889,23 @@ static int ldlm_cb_interpret(const struct lu_env *env, RETURN(0); } -static void ldlm_update_resend(struct ptlrpc_request *req, void *data) +static int ldlm_update_resend(struct ptlrpc_request *req, void *data) { struct ldlm_cb_async_args *ca = data; struct ldlm_lock *lock = ca->ca_lock; + /* See if the lock was already canceled, if so, don't do any + * resends, I am afraid to use lock_is_granted because what if + * we've got a lock convert? Or is it also susceptible to this kind + * of problems? 
*/ + if (lock->l_granted_mode == LCK_NL) { + req->rq_delay_limit = 1; + return -ESTALE; + } + ldlm_refresh_waiting_lock(lock, ldlm_bl_timeout(lock)); + return 0; } static inline int ldlm_ast_fini(struct ptlrpc_request *req, @@ -1015,6 +1060,21 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, /* Do not resend after lock callback timeout */ req->rq_delay_limit = ldlm_bl_timeout(lock); req->rq_resend_cb = ldlm_update_resend; + + /* We remember the request here so we can kill the request + * if the lock cancel/conversion comes before AST reply. + * For simplicity we only do this for blocking AST because + * there could be only one, so we don't need to do + * the whole list + locking mess. + * It does not help that there are no lock-specific spinlocks + * in struct ldlm_lock. + * To safely use this stored RPC caller pretty much has to + * use atomic exchange with NULL to avoid situation of two + * threads working on the same RPC. + */ + LASSERT(!lock->l_sent_bl_ast); + ptlrpc_request_addref(req); + lock->l_sent_bl_ast = req; } req->rq_send_state = LUSTRE_IMP_FULL; @@ -1649,6 +1709,7 @@ int ldlm_handle_convert0(struct ptlrpc_request *req, struct obd_export *exp = req->rq_export; struct ldlm_reply *dlm_rep; struct ldlm_lock *lock; + struct ptlrpc_request *ast_req; __u64 bits; __u64 new_bits; int rc; @@ -1691,6 +1752,8 @@ int ldlm_handle_convert0(struct ptlrpc_request *req, GOTO(out_put, rc = -EPROTO); } + /* Grab AST RPC for later abort */ + ast_req = xchg_acquire(&lock->l_sent_bl_ast, NULL); if (bits == new_bits) { /* * This can be valid situation if CONVERT RPCs are @@ -1712,6 +1775,14 @@ int ldlm_handle_convert0(struct ptlrpc_request *req, ldlm_reprocess_all(lock->l_resource, NULL); } + /* See if there's an outstanding AST RPC for this lock + * and if yes - kill it. 
*/ + if (ast_req) { + ptlrpc_req_abort(ast_req); + ptlrpc_req_finished(ast_req); + } + dlm_rep->lock_handle = lock->l_remote_handle; ldlm_ibits_policy_local_to_wire(&lock->l_policy_data, &dlm_rep->lock_desc.l_policy_data); @@ -1738,6 +1809,7 @@ int ldlm_request_cancel(struct ptlrpc_request *req, struct ldlm_lock *lock; int i, count, done = 0; unsigned int size; + struct ptlrpc_request *ast_req; ENTRY; @@ -1810,7 +1882,19 @@ int ldlm_request_cancel(struct ptlrpc_request *req, delay); at_measured(&lock->l_export->exp_bl_lock_at, delay); } + + /* We grab our reference early to combat races */ + ast_req = xchg_acquire(&lock->l_sent_bl_ast, NULL); + ldlm_lock_cancel(lock); + /* See if there's an outstanding AST RPC for this lock + * and if yes - kill it. + */ + if (ast_req) { + ptlrpc_req_abort(ast_req); + ptlrpc_req_finished(ast_req); + } + LDLM_LOCK_PUT(lock); } if (pres != NULL) { diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index a7f5f37..ab56bc5 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -3484,6 +3484,20 @@ void ptlrpc_abort_set(struct ptlrpc_request_set *set) } } +void ptlrpc_req_abort(struct ptlrpc_request *req) +{ + spin_lock(&req->rq_lock); + req->rq_err = 1; + req->rq_status = -ECANCELED; + + if (req->rq_phase != RQ_PHASE_RPC) { + spin_unlock(&req->rq_lock); + return; + } + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); +} + /** * Initialize the XID for the node. 
This is common among all requests on * this node, and only requires the property that it is monotonically diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index d4c0b71..7713a31 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -794,8 +794,12 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) if (request->rq_resend) { lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); - if (request->rq_resend_cb != NULL) - request->rq_resend_cb(request, &request->rq_async_args); + if (request->rq_resend_cb != NULL) { + rc = request->rq_resend_cb(request, + &request->rq_async_args); + if (rc) + return rc; + } } if (request->rq_memalloc) mpflag = memalloc_noreclaim_save(); -- 1.8.3.1