Whamcloud - gitweb
LU-18111 ptlrpc: don't drop expired cancel request 69/58769/3
author Andriy Skulysh <andriy.skulysh@hpe.com>
Mon, 24 Apr 2023 10:54:05 +0000 (13:54 +0300)
committer Oleg Drokin <green@whamcloud.com>
Fri, 9 May 2025 01:45:28 +0000 (01:45 +0000)
There is no need for the server to drop an expired cancel request,
because the client will resend it with the same content.
Even if the server is heavily loaded, processing the cancel request
helps to release ldlm resources and avoids spending time
on processing resends of the same request.

Add an extra check so that a cancel request is not applied to a lock
with the same cookie that belongs to another client's export.

Lustre-change: https://review.whamcloud.com/55946
Lustre-commit: 3c4387cb61e8a4056ce56ae37ab538e86265fac7

Change-Id: Ib6e22de72262065c453a390e5563f6ac4212c5a6
HPE-bug-id: LUS-11479, LUS-11595
Reviewed-by: Alexander Boyko <alexander.boyko@hpe.com>
Reviewed-by: Alexander Zarochentsev <alexander.zarochentsev@hpe.com>
Signed-off-by: Andriy Skulysh <andriy.skulysh@hpe.com>
Reviewed-by: Mikhail Pershin <mpershin@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/58769
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/ldlm/ldlm_lockd.c
lustre/ptlrpc/service.c

index 06b4350..c9d6d31 100644 (file)
@@ -1724,10 +1724,16 @@ int ldlm_request_cancel(struct ptlrpc_request *req,
                lock = ldlm_handle2lock(&dlm_req->lock_handle[i]);
                if (!lock) {
                        /* below message checked in replay-single.sh test_36 */
-                       LDLM_DEBUG_NOLOCK("server-side cancel handler stale lock (cookie %llu)",
+                       LDLM_DEBUG_NOLOCK("server-side cancel handler stale lock (cookie %llx)",
                                          dlm_req->lock_handle[i].cookie);
                        continue;
                }
+               if (lock->l_export != req->rq_export) {
+                       LDLM_DEBUG_NOLOCK("server-side cancel mismatched export (cookie %llx)",
+                                       dlm_req->lock_handle[i].cookie);
+                       LDLM_LOCK_PUT(lock);
+                       continue;
+               }
 
                res = lock->l_resource;
                done++;
index 63c1ac0..98ed97f 100644 (file)
@@ -2237,6 +2237,7 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
        ktime_t arrived;
        s64 timediff_usecs;
        s64 arrived_usecs;
+       __u32 op;
        int fail_opc = 0;
 
        ENTRY;
@@ -2245,6 +2246,8 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
        if (request == NULL)
                RETURN(0);
 
+       op = lustre_msg_get_opc(request->rq_reqmsg);
+
        if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT))
                fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT;
        else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
@@ -2286,7 +2289,8 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
         * Discard requests queued for longer than the deadline.
         * The deadline is increased if we send an early reply.
         */
-       if (ktime_get_real_seconds() > request->rq_deadline) {
+       if (op != LDLM_CANCEL &&
+           ktime_get_real_seconds() > request->rq_deadline) {
                DEBUG_REQ(D_ERROR, request,
                          "Dropping timed-out request from %s: deadline %lld/%llds ago",
                          libcfs_id2str(request->rq_peer),
@@ -2304,11 +2308,10 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
               (request->rq_export ?
                refcount_read(&request->rq_export->exp_handle.h_ref) : -99),
               lustre_msg_get_status(request->rq_reqmsg), request->rq_xid,
-              libcfs_id2str(request->rq_peer),
-              lustre_msg_get_opc(request->rq_reqmsg),
+              libcfs_id2str(request->rq_peer), op,
               lustre_msg_get_jobid(request->rq_reqmsg) ?: "");
 
-       if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING)
+       if (op != OBD_PING)
                CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val);
 
        CDEBUG(D_NET, "got req %llu\n", request->rq_xid);
@@ -2345,8 +2348,7 @@ put_conn:
                refcount_read(&request->rq_export->exp_handle.h_ref) : -99),
               lustre_msg_get_status(request->rq_reqmsg),
               request->rq_xid,
-              libcfs_id2str(request->rq_peer),
-              lustre_msg_get_opc(request->rq_reqmsg),
+              libcfs_id2str(request->rq_peer), op,
               lustre_msg_get_jobid(request->rq_reqmsg) ?: "",
               timediff_usecs,
               arrived_usecs,
@@ -2357,7 +2359,6 @@ put_conn:
               (request->rq_repmsg ?
               lustre_msg_get_status(request->rq_repmsg) : -999));
        if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) {
-               __u32 op = lustre_msg_get_opc(request->rq_reqmsg);
                int opc = opcode_offset(op);
 
                if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) {