Whamcloud - gitweb
LU-18111 ptlrpc: don't drop expired cancel request 46/55946/3
author Andriy Skulysh <andriy.skulysh@hpe.com>
Mon, 24 Apr 2023 10:54:05 +0000 (13:54 +0300)
committer Oleg Drokin <green@whamcloud.com>
Sun, 25 Aug 2024 16:33:35 +0000 (16:33 +0000)
There is no need for the server to drop an expired cancel request,
because the client will resend the same content.
Even if the server is heavily loaded, processing the cancel request
helps to release ldlm resources and avoids spending time
on processing the same resends.

Add an extra check so that a cancel is skipped when the lock found for
the cookie belongs to a different client's export than the request.

Change-Id: Ib6e22de72262065c453a390e5563f6ac4212c5a6
HPE-bug-id: LUS-11479, LUS-11595
Reviewed-by: Alexander Boyko <alexander.boyko@hpe.com>
Reviewed-by: Alexander Zarochentsev <alexander.zarochentsev@hpe.com>
Signed-off-by: Andriy Skulysh <andriy.skulysh@hpe.com>
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/55946
Tested-by: Maloo <maloo@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Mikhail Pershin <mpershin@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lustre/ldlm/ldlm_lockd.c
lustre/ptlrpc/service.c

index de3fd59..7ad00c2 100644 (file)
@@ -1773,10 +1773,16 @@ int ldlm_request_cancel(struct ptlrpc_request *req,
                lock = ldlm_handle2lock(&dlm_req->lock_handle[i]);
                if (!lock) {
                        /* below message checked in replay-single.sh test_36 */
-                       LDLM_DEBUG_NOLOCK("server-side cancel handler stale lock (cookie %llu)",
+                       LDLM_DEBUG_NOLOCK("server-side cancel handler stale lock (cookie %llx)",
                                          dlm_req->lock_handle[i].cookie);
                        continue;
                }
+               if (lock->l_export != req->rq_export) {
+                       LDLM_DEBUG_NOLOCK("server-side cancel mismatched export (cookie %llx)",
+                                       dlm_req->lock_handle[i].cookie);
+                       LDLM_LOCK_PUT(lock);
+                       continue;
+               }
 
                res = lock->l_resource;
                done++;
index c289844..acaa065 100644 (file)
@@ -2259,6 +2259,7 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
        ktime_t arrived;
        s64 timediff_usecs;
        s64 arrived_usecs;
+       __u32 op;
        int fail_opc = 0;
        struct obd_device *obd = NULL;
 
@@ -2271,6 +2272,8 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
        if (request->rq_export)
                obd = request->rq_export->exp_obd;
 
+       op = lustre_msg_get_opc(request->rq_reqmsg);
+
        if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT))
                fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT;
        else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT))
@@ -2314,7 +2317,8 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
         * Discard requests queued for longer than the deadline.
         * The deadline is increased if we send an early reply.
         */
-       if (ktime_get_real_seconds() > request->rq_deadline) {
+       if (op != LDLM_CANCEL &&
+           ktime_get_real_seconds() > request->rq_deadline) {
                DEBUG_REQ(D_ERROR, request,
                          "Dropping timed-out request from %s: deadline %lld/%llds ago",
                          libcfs_idstr(&request->rq_peer),
@@ -2332,11 +2336,10 @@ static int ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt,
               (request->rq_export ?
                refcount_read(&request->rq_export->exp_handle.h_ref) : -99),
               lustre_msg_get_status(request->rq_reqmsg), request->rq_xid,
-              libcfs_idstr(&request->rq_peer),
-              lustre_msg_get_opc(request->rq_reqmsg),
+              libcfs_idstr(&request->rq_peer), op,
               lustre_msg_get_jobid(request->rq_reqmsg) ?: "");
 
-       if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING)
+       if (op != OBD_PING)
                CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val);
 
        CDEBUG(D_NET, "got req %llu\n", request->rq_xid);
@@ -2377,8 +2380,7 @@ put_conn:
                refcount_read(&request->rq_export->exp_handle.h_ref) : -99),
               lustre_msg_get_status(request->rq_reqmsg),
               request->rq_xid,
-              libcfs_idstr(&request->rq_peer),
-              lustre_msg_get_opc(request->rq_reqmsg),
+              libcfs_idstr(&request->rq_peer), op,
               lustre_msg_get_jobid(request->rq_reqmsg) ?: "",
               timediff_usecs,
               arrived_usecs,
@@ -2389,7 +2391,6 @@ put_conn:
               (request->rq_repmsg ?
               lustre_msg_get_status(request->rq_repmsg) : -999));
        if (likely(svc->srv_stats != NULL && request->rq_reqmsg != NULL)) {
-               __u32 op = lustre_msg_get_opc(request->rq_reqmsg);
                int opc = opcode_offset(op);
 
                if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) {