From 34b2246e4a6c8ce827c404cb4e52f7c6a0a1b90b Mon Sep 17 00:00:00 2001
From: Vitaly Fertman <c17818@cray.com>
Date: Fri, 28 Aug 2020 22:17:58 +0300
Subject: [PATCH] LU-16062 ldlm: improve bl_timeout for prolong

If a client's RPC is at hand, we can do a better job of calculating
the lock callback timeout, because the RPC carries the client's own
view of its timeout. Let's use it.

HPE-bug-id: LUS-8866, LUS-11074
Signed-off-by: Vitaly Fertman <c17818@cray.com>
Change-Id: Ibd67d37c1073d0d3cb2e08b532c801af0de116fe
Reviewed-on: https://es-gerrit.dev.cray.com/157782
Reviewed-by: Andriy Skulysh <c17819@cray.com>
Reviewed-by: Alexey Lyashkov <c17817@cray.com>
Tested-by: Jenkins Build User <nssreleng@cray.com>
Reviewed-on: https://review.whamcloud.com/48094
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Reviewed-by: Andriy Skulysh <andriy.skulysh@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
---
 lustre/include/lustre_dlm.h |  3 ++-
 lustre/include/lustre_net.h | 24 -----------------------
 lustre/ldlm/ldlm_extent.c   |  7 ++-----
 lustre/ldlm/ldlm_lockd.c    | 48 +++++++++++++++++++++++++++++++++++++++++++++
 lustre/mdt/mdt_io.c         |  2 +-
 lustre/ofd/ofd_dev.c        |  2 +-
 6 files changed, 54 insertions(+), 32 deletions(-)

diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h
index 8fc8127..0b0fd66 100644
--- a/lustre/include/lustre_dlm.h
+++ b/lustre/include/lustre_dlm.h
@@ -1395,10 +1395,10 @@ __u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms);
 
 struct ldlm_prolong_args {
 	struct obd_export	*lpa_export;
+	struct ptlrpc_request   *lpa_req;
 	struct ldlm_res_id	lpa_resid;
 	struct ldlm_extent	lpa_extent;
 	enum ldlm_mode		lpa_mode;
-	timeout_t		lpa_timeout;
 	int			lpa_locks_cnt;
 	int			lpa_blocks_cnt;
 };
@@ -1445,6 +1445,7 @@ int ldlm_request_cancel(struct ptlrpc_request *req,
 
 void ldlm_revoke_export_locks(struct obd_export *exp);
 timeout_t ldlm_bl_timeout(struct ldlm_lock *lock);
+timeout_t ldlm_bl_timeout_by_rpc(struct ptlrpc_request *req);
 #endif
 int ldlm_del_waiting_lock(struct ldlm_lock *lock);
 int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, timeout_t timeout);
diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h
index 044caa6..897302c 100644
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -2548,30 +2548,6 @@ ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt)
 	       max_t(int, at, obd_timeout);
 }
 
-/**
- * Calculate the amount of time for lock prolongation.
- *
- * This is helper function to get the timeout extra time.
- *
- * @req		current request
- *
- * Return:	amount of time to extend the timeout with
- */
-static inline timeout_t prolong_timeout(struct ptlrpc_request *req)
-{
-	struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
-	timeout_t req_timeout = 0;
-
-	if (AT_OFF)
-		return obd_timeout / 2;
-
-	if (req->rq_deadline > req->rq_arrival_time.tv_sec)
-		req_timeout = req->rq_deadline - req->rq_arrival_time.tv_sec;
-
-	return max(req_timeout,
-		   at_est2timeout(at_get(&svcpt->scp_at_estimate)));
-}
-
 static inline struct ptlrpc_service *
 ptlrpc_req2svc(struct ptlrpc_request *req)
 {
diff --git a/lustre/ldlm/ldlm_extent.c b/lustre/ldlm/ldlm_extent.c
index ff40ece..4209310 100644
--- a/lustre/ldlm/ldlm_extent.c
+++ b/lustre/ldlm/ldlm_extent.c
@@ -651,16 +651,13 @@ void ldlm_lock_prolong_one(struct ldlm_lock *lock,
 		/* ignore locks not being cancelled */
 		return;
 
-	/* We are in the middle of the process - BL AST is sent, CANCEL
-	 * is ahead. Take half of BL AT + IO AT process time.
-	 */
-	timeout = arg->lpa_timeout + (ldlm_bl_timeout(lock) >> 1);
-
 	arg->lpa_blocks_cnt++;
 
 	/* OK. this is a possible lock the user holds doing I/O
 	 * let's refresh eviction timer for it.
 	 */
+	timeout = ldlm_bl_timeout_by_rpc(arg->lpa_req);
+	LDLM_DEBUG(lock, "refreshed to %ds.\n", timeout);
 	ldlm_refresh_waiting_lock(lock, timeout);
 }
 EXPORT_SYMBOL(ldlm_lock_prolong_one);
diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c
index f54b788..a5a49f9 100644
--- a/lustre/ldlm/ldlm_lockd.c
+++ b/lustre/ldlm/ldlm_lockd.c
@@ -648,6 +648,54 @@ timeout_t ldlm_bl_timeout(struct ldlm_lock *lock)
 EXPORT_SYMBOL(ldlm_bl_timeout);
 
 /**
+ * Calculate the per-export blocking timeout from the given RPC (covering the
+ * reply to this RPC and the next RPC). The next RPC may still not be a CANCEL,
+ * but given the lock refresh mechanism this is enough.
+ *
+ * Used as the lock refresh timeout when we are in the middle of the process -
+ * the BL AST is sent, the CANCEL is ahead - so there is still 1 reply to the
+ * current RPC and at least 1 more RPC (which will trigger another refresh if
+ * it is not a CANCEL). This is more accurate than ldlm_bl_timeout() because
+ * the timeout is derived from the RPC, i.e. the client's view of the current
+ * AT is taken into account.
+ *
+ * \param[in] req     req which export needs the timeout calculation
+ *
+ * \retval            timeout in seconds to wait for the next client's RPC
+ */
+timeout_t ldlm_bl_timeout_by_rpc(struct ptlrpc_request *req)
+{
+	struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt;
+	timeout_t timeout, req_timeout, at_timeout, netl;
+
+	if (AT_OFF)
+		return obd_timeout / 2;
+
+	/* A blocked lock means somebody in the cluster is waiting, so we
+	 * should not plan for the worst-ever case of a chain of failures
+	 * at every step. However, this timeout should survive recovery
+	 * from at least 1 failure; let that be the worst single one: a
+	 * server NID is dead, and the first re-connect goes through the
+	 * same router and also times out.
+	 *
+	 * Either this or the next RPC times out, so take the max.
+	 * For the current RPC, count only the time remaining.
+	 */
+	netl = at_get(&req->rq_export->exp_imp_reverse->imp_at.iat_net_latency);
+	req_timeout = req->rq_deadline - ktime_get_real_seconds() + netl;
+	at_timeout = at_est2timeout(at_get(&svcpt->scp_at_estimate)) + netl;
+	req_timeout = max(req_timeout, at_timeout);
+
+	/* Take 1 re-connect failure and 1 re-connect success into account. */
+	timeout = at_timeout + INITIAL_CONNECT_TIMEOUT + netl + req_timeout;
+
+	/* Client's timeout is calculated as at_est2timeout(); pad ours by
+	 * 1/16 to be a bit more conservative than the client. */
+	return max(timeout + (timeout >> 4), (timeout_t)ldlm_enqueue_min);
+}
+EXPORT_SYMBOL(ldlm_bl_timeout_by_rpc);
+
+/**
  * Perform lock cleanup if AST sending failed.
  */
 static void ldlm_failed_ast(struct ldlm_lock *lock, int rc,
diff --git a/lustre/mdt/mdt_io.c b/lustre/mdt/mdt_io.c
index 8858135..3a9ec7e 100644
--- a/lustre/mdt/mdt_io.c
+++ b/lustre/mdt/mdt_io.c
@@ -100,7 +100,7 @@ static void mdt_prolong_dom_lock(struct tgt_session_info *tsi,
 
 	ENTRY;
 
-	data->lpa_timeout = prolong_timeout(tgt_ses_req(tsi));
+	data->lpa_req = tgt_ses_req(tsi);
 	data->lpa_export = tsi->tsi_exp;
 	data->lpa_resid = tsi->tsi_resid;
 
diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c
index b230132..811a1e7 100644
--- a/lustre/ofd/ofd_dev.c
+++ b/lustre/ofd/ofd_dev.c
@@ -2482,7 +2482,7 @@ static void ofd_prolong_extent_locks(struct tgt_session_info *tsi,
 
 	ENTRY;
 
-	data->lpa_timeout = prolong_timeout(tgt_ses_req(tsi));
+	data->lpa_req = tgt_ses_req(tsi);
 	data->lpa_export = tsi->tsi_exp;
 	data->lpa_resid = tsi->tsi_resid;
 
-- 
1.8.3.1