From 0b09d826149f4baadce305df63396bf86eb20cf7 Mon Sep 17 00:00:00 2001
From: Sebastien Buisson <sbuisson@ddn.com>
Date: Sat, 8 Feb 2020 02:16:02 +0900
Subject: [PATCH] LU-13216 ptlrpc: sptlrpc_req_refresh_ctx's timeout semantic

sptlrpc_req_refresh_ctx() is only called with two possible timeouts
in the current code: either we do not wait at all, or we wait
indefinitely.
So change semantic of sptlrpc_req_refresh_ctx()'s timeout parameter,
to be more consistent with wait* primitives. timeout == 0 means
'do not wait at all', and timeout == MAX_SCHEDULE_TIMEOUT means
'wait indefinitely'. Other timeout values are unsupported.

Consequently, remove code that handles the case of a non zero and
non infinite timeout. And remove the unsed ctx_refresh_timeout()
callback.
It solves a problem in the current code, because in case of indefinite
wait, sptlrpc_req_refresh_ctx() is not supposed to try to refresh the
request, so it must not call the ctx_refresh_timeout() callback.

Fixes: c1fad6a9a5 ("LU-10467 ptlrpc: convert waiting in sptlrpc_req_refresh_ctx()")
Test-Parameters: trivial envdefinitions=SHARED_KEY=true testlist=sanity-sec
Test-Parameters: envdefinitions=SHARED_KEY=true testlist=sanity-sec
Test-Parameters: envdefinitions=SHARED_KEY=true testlist=sanity-sec
Test-Parameters: envdefinitions=SHARED_KEY=true testlist=sanity-sec
Test-Parameters: envdefinitions=SHARED_KEY=true testlist=sanity-sec
Signed-off-by: Sebastien Buisson <sbuisson@ddn.com>
Change-Id: I1b20457c86863d57f97f4d998f7c5fa8ec928b31
Reviewed-on: https://review.whamcloud.com/37473
Reviewed-by: Neil Brown <neilb@suse.de>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Tested-by: Maloo <maloo@whamcloud.com>
Tested-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
---
 lustre/ptlrpc/client.c |  4 ++--
 lustre/ptlrpc/sec.c    | 54 ++++++++++++++++----------------------------------
 2 files changed, 19 insertions(+), 39 deletions(-)

diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index 2ce24c9..758d86e 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -1673,7 +1673,7 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req)
 
 	lustre_msg_set_status(req->rq_reqmsg, current->pid);
 
-	rc = sptlrpc_req_refresh_ctx(req, -1);
+	rc = sptlrpc_req_refresh_ctx(req, 0);
 	if (rc) {
 		if (req->rq_err) {
 			req->rq_status = rc;
@@ -1990,7 +1990,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
 				 * rq_wait_ctx is only touched by ptlrpcd,
 				 * so no lock is needed here.
 				 */
-				status = sptlrpc_req_refresh_ctx(req, -1);
+				status = sptlrpc_req_refresh_ctx(req, 0);
 				if (status) {
 					if (req->rq_err) {
 						req->rq_status = status;
diff --git a/lustre/ptlrpc/sec.c b/lustre/ptlrpc/sec.c
index 0e79684..dc68e0e 100644
--- a/lustre/ptlrpc/sec.c
+++ b/lustre/ptlrpc/sec.c
@@ -628,27 +628,6 @@ int ctx_check_refresh(struct ptlrpc_cli_ctx *ctx)
 }
 
 static
-int ctx_refresh_timeout(struct ptlrpc_request *req)
-{
-	int rc;
-
-	/* conn_cnt is needed in expire_one_request */
-	lustre_msg_set_conn_cnt(req->rq_reqmsg, req->rq_import->imp_conn_cnt);
-
-	rc = ptlrpc_expire_one_request(req, 1);
-	/*
-	 * if we started recovery, we should mark this ctx dead; otherwise
-	 * in case of lgssd died nobody would retire this ctx, following
-	 * connecting will still find the same ctx thus cause deadlock.
-	 * there's an assumption that expire time of the request should be
-	 * later than the context refresh expire time.
-	 */
-	if (rc == 0)
-		req->rq_cli_ctx->cc_ops->die(req->rq_cli_ctx, 0);
-	return rc;
-}
-
-static
 void ctx_refresh_interrupt(struct ptlrpc_request *req)
 {
 
@@ -669,9 +648,9 @@ void req_off_ctx_list(struct ptlrpc_request *req, struct ptlrpc_cli_ctx *ctx)
 /**
  * To refresh the context of \req, if it's not up-to-date.
  * \param timeout
- * - < 0: don't wait
- * - = 0: wait until success or fatal error occur
- * - > 0: timeout value (in seconds)
+ * - == 0: do not wait
+ * - == MAX_SCHEDULE_TIMEOUT: wait indefinitely
+ * - > 0: not supported
  *
  * The status of the context could be subject to be changed by other threads
  * at any time. We allow this race, but once we return with 0, the caller will
@@ -693,6 +672,11 @@ int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout)
 	if (req->rq_ctx_init || req->rq_ctx_fini)
 		RETURN(0);
 
+	if (timeout != 0 && timeout != MAX_SCHEDULE_TIMEOUT) {
+		CERROR("req %p: invalid timeout %lu\n", req, timeout);
+		RETURN(-EINVAL);
+	}
+
 	/*
 	 * during the process a request's context might change type even
 	 * (e.g. from gss ctx to null ctx), so each loop we need to re-check
@@ -805,7 +789,7 @@ again:
 		list_add(&req->rq_ctx_chain, &ctx->cc_req_list);
 	spin_unlock(&ctx->cc_lock);
 
-	if (timeout < 0)
+	if (timeout == 0)
 		RETURN(-EWOULDBLOCK);
 
 	/* Clear any flags that may be present from previous sends */
@@ -817,17 +801,13 @@ again:
 	req->rq_restart = 0;
 	spin_unlock(&req->rq_lock);
 
-	if (wait_event_idle_timeout(req->rq_reply_waitq,
-				    ctx_check_refresh(ctx),
-				    cfs_time_seconds(timeout)) == 0) {
-		rc = -ETIMEDOUT;
-		if (!ctx_refresh_timeout(req) &&
-		    l_wait_event_abortable(req->rq_reply_waitq,
-					   ctx_check_refresh(ctx))
-		    == -ERESTARTSYS) {
-			rc = -EINTR;
-			ctx_refresh_interrupt(req);
-		}
+	/* by now we know that timeout value is MAX_SCHEDULE_TIMEOUT,
+	 * so wait indefinitely with non-fatal signals blocked
+	 */
+	if (l_wait_event_abortable(req->rq_reply_waitq,
+				   ctx_check_refresh(ctx)) == -ERESTARTSYS) {
+		rc = -EINTR;
+		ctx_refresh_interrupt(req);
 	}
 
 	/*
@@ -981,7 +961,7 @@ int sptlrpc_import_check_ctx(struct obd_import *imp)
 	req->rq_flvr = sec->ps_flvr;
 	req->rq_cli_ctx = ctx;
 
-	rc = sptlrpc_req_refresh_ctx(req, 0);
+	rc = sptlrpc_req_refresh_ctx(req, MAX_SCHEDULE_TIMEOUT);
 	LASSERT(list_empty(&req->rq_ctx_chain));
 	sptlrpc_cli_ctx_put(req->rq_cli_ctx, 1);
 	ptlrpc_request_cache_free(req);
-- 
1.8.3.1