From: Andriy Skulysh Date: Mon, 24 Apr 2023 10:54:05 +0000 (+0300) Subject: LU-18111 ptlrpc: don't drop expired cancel request X-Git-Tag: 2.15.90~3 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=3c4387cb61e8a4056ce56ae37ab538e86265fac7;p=fs%2Flustre-release.git LU-18111 ptlrpc: don't drop expired cancel request There is no need for the server to drop an expired cancel request, because the client will resend the same content. Even if the server is heavily loaded, processing the cancel request helps to release ldlm resources and avoids spending time processing the same resends. Add an extra check so that a cancel is skipped when the lock found for the cookie belongs to another client's export. Change-Id: Ib6e22de72262065c453a390e5563f6ac4212c5a6 HPE-bug-id: LUS-11479, LUS-11595 Reviewed-by: Alexander Boyko Reviewed-by: Alexander Zarochentsev Signed-off-by: Andriy Skulysh Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/55946 Tested-by: Maloo Tested-by: jenkins Reviewed-by: Mikhail Pershin Reviewed-by: Oleg Drokin Reviewed-by: Andreas Dilger --- diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index de3fd59..7ad00c2 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1773,10 +1773,16 @@ int ldlm_request_cancel(struct ptlrpc_request *req, lock = ldlm_handle2lock(&dlm_req->lock_handle[i]); if (!lock) { /* below message checked in replay-single.sh test_36 */ - LDLM_DEBUG_NOLOCK("server-side cancel handler stale lock (cookie %llu)", + LDLM_DEBUG_NOLOCK("server-side cancel handler stale lock (cookie %llx)", dlm_req->lock_handle[i].cookie); continue; } + if (lock->l_export != req->rq_export) { + LDLM_DEBUG_NOLOCK("server-side cancel mismatched export (cookie %llx)", + dlm_req->lock_handle[i].cookie); + LDLM_LOCK_PUT(lock); + continue; + } res = lock->l_resource; done++; diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index c289844..acaa065 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -2259,6 +2259,7 @@ static int 
ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, ktime_t arrived; s64 timediff_usecs; s64 arrived_usecs; + __u32 op; int fail_opc = 0; struct obd_device *obd = NULL; @@ -2271,6 +2272,8 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, if (request->rq_export) obd = request->rq_export->exp_obd; + op = lustre_msg_get_opc(request->rq_reqmsg); + if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT)) fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT; else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) @@ -2314,7 +2317,8 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, * Discard requests queued for longer than the deadline. * The deadline is increased if we send an early reply. */ - if (ktime_get_real_seconds() > request->rq_deadline) { + if (op != LDLM_CANCEL && + ktime_get_real_seconds() > request->rq_deadline) { DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s: deadline %lld/%llds ago", libcfs_idstr(&request->rq_peer), @@ -2332,11 +2336,10 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, (request->rq_export ? 
refcount_read(&request->rq_export->exp_handle.h_ref) : -99), lustre_msg_get_status(request->rq_reqmsg), request->rq_xid, - libcfs_idstr(&request->rq_peer), - lustre_msg_get_opc(request->rq_reqmsg), + libcfs_idstr(&request->rq_peer), op, lustre_msg_get_jobid(request->rq_reqmsg) ?: ""); - if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING) + if (op != OBD_PING) CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val); CDEBUG(D_NET, "got req %llu\n", request->rq_xid); @@ -2377,8 +2380,7 @@ put_conn: refcount_read(&request->rq_export->exp_handle.h_ref) : -99), lustre_msg_get_status(request->rq_reqmsg), request->rq_xid, - libcfs_idstr(&request->rq_peer), - lustre_msg_get_opc(request->rq_reqmsg), + libcfs_idstr(&request->rq_peer), op, lustre_msg_get_jobid(request->rq_reqmsg) ?: "", timediff_usecs, arrived_usecs, @@ -2389,7 +2391,6 @@ put_conn: (request->rq_repmsg ? lustre_msg_get_status(request->rq_repmsg) : -999)); if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) { - __u32 op = lustre_msg_get_opc(request->rq_reqmsg); int opc = opcode_offset(op); if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) {