From: Andriy Skulysh Date: Mon, 24 Apr 2023 10:54:05 +0000 (+0300) Subject: LU-18111 ptlrpc: don't drop expired cancel request X-Git-Tag: 2.15.7-RC1~28 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=refs%2Fchanges%2F69%2F58769%2F3;p=fs%2Flustre-release.git LU-18111 ptlrpc: don't drop expired cancel request There is no need for the server to drop an expired cancel request, because the client will resend the same content. Even if the server is heavily loaded, processing the cancel request helps to release ldlm resources and avoids spending time on processing the same resends. Add an extra check so that a cancel is not applied to a lock with the same cookie that belongs to another client's export. Lustre-change: https://review.whamcloud.com/55946 Lustre-commit: 3c4387cb61e8a4056ce56ae37ab538e86265fac7 Change-Id: Ib6e22de72262065c453a390e5563f6ac4212c5a6 HPE-bug-id: LUS-11479, LUS-11595 Reviewed-by: Alexander Boyko Reviewed-by: Alexander Zarochentsev Signed-off-by: Andriy Skulysh Reviewed-by: Mikhail Pershin Reviewed-by: Andreas Dilger Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/58769 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 06b4350..c9d6d31 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1724,10 +1724,16 @@ int ldlm_request_cancel(struct ptlrpc_request *req, lock = ldlm_handle2lock(&dlm_req->lock_handle[i]); if (!lock) { /* below message checked in replay-single.sh test_36 */ - LDLM_DEBUG_NOLOCK("server-side cancel handler stale lock (cookie %llu)", + LDLM_DEBUG_NOLOCK("server-side cancel handler stale lock (cookie %llx)", dlm_req->lock_handle[i].cookie); continue; } + if (lock->l_export != req->rq_export) { + LDLM_DEBUG_NOLOCK("server-side cancel mismatched export (cookie %llx)", + dlm_req->lock_handle[i].cookie); + LDLM_LOCK_PUT(lock); + continue; + } res = lock->l_resource; done++; diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 63c1ac0..98ed97f 100644 --- 
a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -2237,6 +2237,7 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, ktime_t arrived; s64 timediff_usecs; s64 arrived_usecs; + __u32 op; int fail_opc = 0; ENTRY; @@ -2245,6 +2246,8 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, if (request == NULL) RETURN(0); + op = lustre_msg_get_opc(request->rq_reqmsg); + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT)) fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT; else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) @@ -2286,7 +2289,8 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, * Discard requests queued for longer than the deadline. * The deadline is increased if we send an early reply. */ - if (ktime_get_real_seconds() > request->rq_deadline) { + if (op != LDLM_CANCEL && + ktime_get_real_seconds() > request->rq_deadline) { DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s: deadline %lld/%llds ago", libcfs_id2str(request->rq_peer), @@ -2304,11 +2308,10 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, (request->rq_export ? 
refcount_read(&request->rq_export->exp_handle.h_ref) : -99), lustre_msg_get_status(request->rq_reqmsg), request->rq_xid, - libcfs_id2str(request->rq_peer), - lustre_msg_get_opc(request->rq_reqmsg), + libcfs_id2str(request->rq_peer), op, lustre_msg_get_jobid(request->rq_reqmsg) ?: ""); - if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING) + if (op != OBD_PING) CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val); CDEBUG(D_NET, "got req %llu\n", request->rq_xid); @@ -2345,8 +2348,7 @@ put_conn: refcount_read(&request->rq_export->exp_handle.h_ref) : -99), lustre_msg_get_status(request->rq_reqmsg), request->rq_xid, - libcfs_id2str(request->rq_peer), - lustre_msg_get_opc(request->rq_reqmsg), + libcfs_id2str(request->rq_peer), op, lustre_msg_get_jobid(request->rq_reqmsg) ?: "", timediff_usecs, arrived_usecs, @@ -2357,7 +2359,6 @@ put_conn: (request->rq_repmsg ? lustre_msg_get_status(request->rq_repmsg) : -999)); if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) { - __u32 op = lustre_msg_get_opc(request->rq_reqmsg); int opc = opcode_offset(op); if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) {