From b4603a9e81239b4e6021c640c1d24e4ed8f8fc4b Mon Sep 17 00:00:00 2001 From: Alexander Boyko Date: Mon, 9 Apr 2018 07:48:09 -0400 Subject: [PATCH] LU-10889 ptlrpc: update req timeout if resending happened When the server drops duplicate request processing, the client and the server have different deadlines for the same request. The server operates with the first copy and the client operates with the second. This patch adds request deadline updates if a duplicate request is found. A fix for LU-8420 changed the lock callback prolong calculation to use the request deadline in the case when the service estimate has changed since the request was created. Using an outdated deadline may cause an insufficient prolong timeout and subsequent client eviction. Signed-off-by: Alexander Boyko Signed-off-by: Vladimir Saveliev Change-Id: I55725d396f50d864687248df46e7882290fc21ca Cray-bug-id: MRP-3720 MRP-4289 Reviewed-on: https://review.whamcloud.com/31910 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Vitaly Fertman Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- lustre/ptlrpc/service.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 2930f85..38b9d75 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -1546,18 +1546,19 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) /* Check if we are already handling earlier incarnation of this request. 
* Called under &req->rq_export->exp_rpc_lock locked */ -static int ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req) +static struct ptlrpc_request* +ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req) { struct ptlrpc_request *tmp = NULL; if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) || (atomic_read(&req->rq_export->exp_rpc_count) == 0)) - return 0; + return NULL; /* bulk request are aborted upon reconnect, don't try to * find a match */ if (req->rq_bulk_write || req->rq_bulk_read) - return 0; + return NULL; /* This list should not be longer than max_requests in * flights on the client, so it is not all that long. @@ -1575,12 +1576,12 @@ static int ptlrpc_server_check_resend_in_progress(struct ptlrpc_request *req) if (tmp->rq_xid == req->rq_xid) goto found; } - return 0; + return NULL; found: DEBUG_REQ(D_HA, req, "Found duplicate req in processing"); DEBUG_REQ(D_HA, tmp, "Request being processed"); - return -EBUSY; + return tmp; } /** @@ -1670,6 +1671,7 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, { int rc; bool hp; + struct ptlrpc_request *orig; ENTRY; rc = ptlrpc_server_hpreq_init(svcpt, req); @@ -1685,12 +1687,30 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, /* do search for duplicated xid and the adding to the list * atomically */ spin_lock_bh(&exp->exp_rpc_lock); - rc = ptlrpc_server_check_resend_in_progress(req); - if (rc < 0) { + orig = ptlrpc_server_check_resend_in_progress(req); + if (orig && likely(atomic_inc_not_zero(&orig->rq_refcount))) { + bool linked; + spin_unlock_bh(&exp->exp_rpc_lock); + /* + * When the client resend request and the server has + * the previous copy of it, we need to update deadlines, + * to be sure that the client and the server have equal + * request deadlines. 
+ */ + + spin_lock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock); + linked = orig->rq_at_linked; + if (likely(linked)) + ptlrpc_at_remove_timed(orig); + spin_unlock(&orig->rq_rqbd->rqbd_svcpt->scp_at_lock); + orig->rq_deadline = req->rq_deadline; + if (likely(linked)) + ptlrpc_at_add_timed(orig); + ptlrpc_server_drop_request(orig); ptlrpc_nrs_req_finalize(req); - RETURN(rc); + RETURN(-EBUSY); } if (hp || req->rq_ops != NULL) -- 1.8.3.1