From b4603a9e81239b4e6021c640c1d24e4ed8f8fc4b Mon Sep 17 00:00:00 2001
From: Alexander Boyko <c17825@cray.com>
Date: Mon, 9 Apr 2018 07:48:09 -0400
Subject: [PATCH] LU-10889 ptlrpc: update req timeout if resending happened

When the server drops duplicate request processing, the client and
the server have different deadlines for the same request. The server
operates with the first copy and the client operates with the second.

This patch adds request deadline updates if a duplicate request is
found.

A fix for LU-8420 changed the lock callback prolong calculation to use
the request deadline when the service estimate has changed since the
request was created. Using an outdated deadline may cause an
insufficient prolong timeout and subsequent client eviction.

Signed-off-by: Alexander Boyko <c17825@cray.com>
Signed-off-by: Vladimir Saveliev <c17830@cray.com>
Change-Id: I55725d396f50d864687248df46e7882290fc21ca
Cray-bug-id: MRP-3720 MRP-4289
Reviewed-on: https://review.whamcloud.com/31910
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Vitaly Fertman <c17818@cray.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
---
 lustre/ptlrpc/service.c | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c
index 2930f85..38b9d75 100644
--- a/lustre/ptlrpc/service.c
+++ b/lustre/ptlrpc/service.c
@@ -1546,18 +1546,19 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt)
 
 /* Check if we are already handling earlier incarnation of this request.
  * Called under &req->rq_export->exp_rpc_lock locked */
-static int ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req)
+static struct ptlrpc_request*
+ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req)
 {
 	struct ptlrpc_request	*tmp = NULL;
 
 	if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ||
 	    (atomic_read(&req->rq_export->exp_rpc_count) == 0))
-		return 0;
+		return NULL;
 
 	/* bulk request are aborted upon reconnect, don't try to
 	 * find a match */
 	if (req->rq_bulk_write || req->rq_bulk_read)
-		return 0;
+		return NULL;
 
 	/* This list should not be longer than max_requests in
 	 * flights on the client, so it is not all that long.
@@ -1575,12 +1576,12 @@ static int ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req)
 		if (tmp->rq_xid == req->rq_xid)
 			goto found;
 	}
-	return 0;
+	return NULL;
 
 found:
 	DEBUG_REQ(D_HA, req, "Found duplicate req in processing");
 	DEBUG_REQ(D_HA, tmp, "Request being processed");
-	return -EBUSY;
+	return tmp;
 }
 
 /**
@@ -1670,6 +1671,7 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
 {
 	int rc;
 	bool hp;
+	struct ptlrpc_request *orig;
 	ENTRY;
 
 	rc = ptlrpc_server_hpreq_init(svcpt, req);
@@ -1685,12 +1687,30 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
 		/* do search for duplicated xid and the adding to the list
 		 * atomically */
 		spin_lock_bh(&exp->exp_rpc_lock);
-		rc = ptlrpc_server_check_resend_in_progress(req);
-		if (rc < 0) {
+		orig = ptlrpc_server_check_resend_in_progress(req);
+		if (orig && likely(atomic_inc_not_zero(&orig->rq_refcount))) {
+			bool linked;
+
 			spin_unlock_bh(&exp->exp_rpc_lock);
 
+			/*
+			 * When the client resends a request and the server
+			 * still has the previous copy of it, we need to update
+			 * the deadline of that copy, to be sure that the client
+			 * and the server have equal request deadlines.
+			 */
+
+			spin_lock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock);
+			linked = orig->rq_at_linked;
+			if (likely(linked))
+				ptlrpc_at_remove_timed(orig);
+			spin_unlock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock);
+			orig->rq_deadline = req->rq_deadline;
+			if (likely(linked))
+				ptlrpc_at_add_timed(orig);
+			ptlrpc_server_drop_request(orig);
 			ptlrpc_nrs_req_finalize(req);
-			RETURN(rc);
+			RETURN(-EBUSY);
 		}
 
 		if (hp || req->rq_ops != NULL)
-- 
1.8.3.1