Whamcloud - gitweb
LU-17446 ldlm: Do not wait for BL AST RPC completion on cancel 39/53739/12
author Oleg Drokin <green@whamcloud.com>
Fri, 19 Jan 2024 05:24:43 +0000 (00:24 -0500)
committer Andreas Dilger <adilger@whamcloud.com>
Sat, 27 Jan 2024 18:24:25 +0000 (18:24 +0000)
If we have sent an AST RPC to the client and while it's in flight
the client sent in the cancel, sometimes (esp. if AST or reply
to it are lost) even though the lock is already cancelled, whoever
is waiting on it is still stuck while trying to resend ASTs.
And in the end the client is not even evicted because the lock cancel
did come and all is fine, but it can add over a hundred seconds
to lock granting process in some non-ideal circumstances.

For simplicity we only treat Blocking ASTs like this, since we
can only have a single one of this kind.

This adds an additional pointer to struct ldlm_lock, but that struct
is already 560 bytes, so the addition does not really mean much.

Change-Id: Id2231bc3bfc3e094faae2872fe09f3c330d441df
Signed-off-by: Oleg Drokin <green@whamcloud.com>
lustre/include/lustre_dlm.h
lustre/include/lustre_net.h
lustre/ldlm/ldlm_lock.c
lustre/ldlm/ldlm_lockd.c
lustre/ptlrpc/client.c
lustre/ptlrpc/niobuf.c

index 82e7a55..0d86774 100644 (file)
@@ -1021,6 +1021,13 @@ struct ldlm_lock {
         * is: res lock -> exp_bl_list_lock -> wanting_lists_spinlock.
         */
        struct list_head        l_exp_list;
+
+       /**
+        * Used internally to trace in-flight blocking ASTs of this lock
+        * There's no locking, so to pick items from here one has to use atomic
+        * exchange primitives
+        */
+       struct ptlrpc_request   *l_sent_bl_ast;
 };
 
 enum ldlm_match_flags {
index 4b12182..bff5b7c 100644 (file)
@@ -725,8 +725,7 @@ typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env,
                                     struct ptlrpc_request *req,
                                     void *arg, int rc);
 /** Type of request resend call-back */
-typedef void (*ptlrpc_resend_cb_t)(struct ptlrpc_request *req,
-                                  void *arg);
+typedef int (*ptlrpc_resend_cb_t)(struct ptlrpc_request *req, void *arg);
 
 /**
  * Definition of request pool structure.
@@ -2094,6 +2093,7 @@ void ptlrpc_restart_req(struct ptlrpc_request *req);
 void ptlrpc_abort_inflight(struct obd_import *imp);
 void ptlrpc_cleanup_imp(struct obd_import *imp);
 void ptlrpc_abort_set(struct ptlrpc_request_set *set);
+void ptlrpc_req_abort(struct ptlrpc_request *req);
 
 struct ptlrpc_request_set *ptlrpc_prep_set(void);
 struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
index 849e039..b86402e 100644 (file)
@@ -2832,7 +2832,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock,
 
         if (resource == NULL) {
                libcfs_debug_msg(msgdata,
-                                "%pV ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n",
+                                "%pV ns: \?\? lock: %pK/%#llx lrc: %d/%d,%d mode: %s/%s res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d/%pK\n",
                                 &vaf,
                                 lock,
                                 lock->l_handle.h_cookie,
@@ -2844,7 +2844,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock,
                                 lock->l_remote_handle.cookie,
                                 exp ? refcount_read(&exp->exp_handle.h_ref) : -99,
                                 lock->l_pid, lock->l_callback_timestamp,
-                                lock->l_lvb_type);
+                                lock->l_lvb_type, lock->l_sent_bl_ast);
                 va_end(args);
                 return;
         }
@@ -2852,7 +2852,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock,
        switch (resource->lr_type) {
        case LDLM_EXTENT:
                libcfs_debug_msg(msgdata,
-                                "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s [%llu->%llu] (req %llu->%llu) gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n",
+                                "%pV ns: %s lock: %pK/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s [%llu->%llu] (req %llu->%llu) gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d/%pK\n",
                                 &vaf,
                                 ldlm_lock_to_ns_name(lock), lock,
                                 lock->l_handle.h_cookie,
@@ -2871,12 +2871,12 @@ void _ldlm_lock_debug(struct ldlm_lock *lock,
                                 lock->l_remote_handle.cookie,
                                 exp ? refcount_read(&exp->exp_handle.h_ref) : -99,
                                 lock->l_pid, lock->l_callback_timestamp,
-                                lock->l_lvb_type);
+                                lock->l_lvb_type, lock->l_sent_bl_ast);
                break;
 
        case LDLM_FLOCK:
                libcfs_debug_msg(msgdata,
-                                "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s pid: %d [%llu->%llu] flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld\n",
+                                "%pV ns: %s lock: %pK/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s pid: %d [%llu->%llu] flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld\n",
                                 &vaf,
                                 ldlm_lock_to_ns_name(lock), lock,
                                 lock->l_handle.h_cookie,
@@ -2899,7 +2899,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock,
        case LDLM_IBITS:
                if (!lock->l_remote_handle.cookie)
                        libcfs_debug_msg(msgdata,
-                                "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx/%#llx rrc: %d type: %s flags: %#llx pid: %u initiator: MDT%d\n",
+                                "%pV ns: %s lock: %pK/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx/%#llx rrc: %d type: %s flags: %#llx pid: %u initiator: MDT%04x, lvb_type: %d/%pK\n",
                                 &vaf,
                                 ldlm_lock_to_ns_name(lock),
                                 lock, lock->l_handle.h_cookie,
@@ -2913,10 +2913,11 @@ void _ldlm_lock_debug(struct ldlm_lock *lock,
                                 atomic_read(&resource->lr_refcount),
                                 ldlm_typename[resource->lr_type],
                                 lock->l_flags, lock->l_pid,
-                                lock->l_policy_data.l_inodebits.li_initiator_id);
+                                lock->l_policy_data.l_inodebits.li_initiator_id,
+                                lock->l_lvb_type, lock->l_sent_bl_ast);
                else
                        libcfs_debug_msg(msgdata,
-                                "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx/%#llx rrc: %d type: %s gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n",
+                                "%pV ns: %s lock: %pK/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx/%#llx rrc: %d type: %s gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d/%pK\n",
                                 &vaf,
                                 ldlm_lock_to_ns_name(lock),
                                 lock, lock->l_handle.h_cookie,
@@ -2934,12 +2935,12 @@ void _ldlm_lock_debug(struct ldlm_lock *lock,
                                 lock->l_remote_handle.cookie,
                                 exp ? refcount_read(&exp->exp_handle.h_ref) : -99,
                                 lock->l_pid, lock->l_callback_timestamp,
-                                lock->l_lvb_type);
+                                lock->l_lvb_type, lock->l_sent_bl_ast);
                break;
 
        default:
                libcfs_debug_msg(msgdata,
-                                "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n",
+                                "%pV ns: %s lock: %pK/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d/%pK\n",
                                 &vaf,
                                 ldlm_lock_to_ns_name(lock),
                                 lock, lock->l_handle.h_cookie,
@@ -2954,7 +2955,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock,
                                 lock->l_remote_handle.cookie,
                                 exp ? refcount_read(&exp->exp_handle.h_ref) : -99,
                                 lock->l_pid, lock->l_callback_timestamp,
-                                lock->l_lvb_type);
+                                lock->l_lvb_type, lock->l_sent_bl_ast);
                break;
        }
        va_end(args);
index 8651412..0fd0300 100644 (file)
@@ -741,6 +741,10 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock,
 {
        struct lnet_processid *peer = &req->rq_import->imp_connection->c_peer;
 
+       /* This AST was cancelled by lock cancel/convert */
+       if (rc == -ECANCELED)
+               return rc;
+
        if (!req->rq_replied || (rc && rc != -EINVAL)) {
                if (ldlm_is_cancel(lock)) {
                        LDLM_DEBUG(lock,
@@ -843,6 +847,36 @@ static int ldlm_cb_interpret(const struct lu_env *env,
                }
                break;
        case LDLM_BL_CALLBACK:
+               /* if nobody grabbed our linked request, decref to match
+                * acquisition.
+                * Three cases are possible here:
+                * - l_sent_bl_ast is equal to our request - we'll free it
+                * - it's equal to NULL - so cancel / convert thread grabbed it
+                * - it's not NULL, but not equal to req - this is a convert
+                *   race, where lock convert came, and by the time we got here,
+                *   the lock got another BL AST that got recorded here.
+                *
+                * This 3rd case is the most annoying handling wise. Once we
+                * acquired this request with atomic exchange there's no safe
+                * way to put it back without risking the other thread missing
+                * it so we have to release it here anyway and bring the
+                * original problem back - if the other AST won't be replied
+                * in time, the other blocked thread would wait for a long time.
+                *
+                * To minimize this window, we'll do an unsafe check first,
+                * it's ok to check sent_bl_ast for both NULL and non-req value
+                * because once those values were seen, they cannot magically
+                * return to the req value. Either way whatever we got we must
+                * free if not NULL.
+                */
+               if (lock->l_sent_bl_ast != NULL && lock->l_sent_bl_ast == req) {
+                       struct ptlrpc_request *saved_req;
+
+                       saved_req = xchg_acquire(&lock->l_sent_bl_ast, NULL);
+                       if (saved_req)
+                               ptlrpc_req_finished(saved_req);
+               }
+
                if (rc != 0)
                        rc = ldlm_handle_ast_error(lock, req, rc, "blocking");
                break;
@@ -865,12 +899,23 @@ static int ldlm_cb_interpret(const struct lu_env *env,
        RETURN(0);
 }
 
-static void ldlm_update_resend(struct ptlrpc_request *req, void *data)
+static int ldlm_update_resend(struct ptlrpc_request *req, void *data)
 {
        struct ldlm_cb_async_args *ca = data;
        struct ldlm_lock *lock = ca->ca_lock;
 
+       /* See if the lock was already canceled, if so, don't do any
+        * resends, I am afraid to use lock_is_granted because what if
+        * we've got a lock convert? Or is it also susceptible to this kind
+        * of problems?
+        */
+       if (lock->l_granted_mode == LCK_NL) {
+               req->rq_delay_limit = 1;
+               return -ESTALE;
+       }
+
        ldlm_refresh_waiting_lock(lock, ldlm_bl_timeout(lock));
+       return 0;
 }
 
 static inline int ldlm_ast_fini(struct ptlrpc_request *req,
@@ -1028,6 +1073,21 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock,
                /* Do not resend after lock callback timeout */
                req->rq_delay_limit = ldlm_bl_timeout(lock);
                req->rq_resend_cb = ldlm_update_resend;
+
+               /* We remember the request here so we can kill the request
+                * if the lock cancel/conversion comes before AST reply.
+                * For simplicity we only do this for blocking AST because
+                * there could be only one, so we don't need to do
+                * the whole list + locking mess.
+                * It does not help that there's no lock-specific spinlocks
+                * in struct ldlm_lock.
+                * To safely use this stored RPC caller pretty much has to
+                * use atomic exchange with NULL to avoid situation of two
+                * threads working on the same RPC.
+                */
+               LASSERT(!lock->l_sent_bl_ast);
+               ptlrpc_request_addref(req);
+               lock->l_sent_bl_ast = req;
        }
 
        req->rq_send_state = LUSTRE_IMP_FULL;
@@ -1669,6 +1729,7 @@ int ldlm_handle_convert0(struct ptlrpc_request *req,
        struct obd_export *exp = req->rq_export;
        struct ldlm_reply *dlm_rep;
        struct ldlm_lock *lock;
+       struct ptlrpc_request *ast_req;
        __u64 bits;
        __u64 new_bits;
        int rc;
@@ -1711,6 +1772,8 @@ int ldlm_handle_convert0(struct ptlrpc_request *req,
                GOTO(out_put, rc = -EPROTO);
        }
 
+       /* Grab AST RPC for later abort */
+       ast_req = xchg_acquire(&lock->l_sent_bl_ast, NULL);
        if (bits == new_bits) {
                /*
                 * This can be valid situation if CONVERT RPCs are
@@ -1735,6 +1798,14 @@ int ldlm_handle_convert0(struct ptlrpc_request *req,
                ldlm_reprocess_all(lock->l_resource, bits);
        }
 
+       /* See if there's an outstanding AST RPC for this lock
+        * and if yes - kill it.
+        */
+       if (ast_req) {
+               ptlrpc_req_abort(ast_req);
+               ptlrpc_req_finished(ast_req);
+       }
+
        dlm_rep->lock_handle = lock->l_remote_handle;
        ldlm_ibits_policy_local_to_wire(&lock->l_policy_data,
                                        &dlm_rep->lock_desc.l_policy_data);
@@ -1761,6 +1832,7 @@ int ldlm_request_cancel(struct ptlrpc_request *req,
        struct ldlm_lock *lock;
        int i, count, done = 0;
        unsigned int size;
+       struct ptlrpc_request *ast_req;
 
        ENTRY;
 
@@ -1835,7 +1907,19 @@ int ldlm_request_cancel(struct ptlrpc_request *req,
                                       &lock->l_export->exp_bl_lock_at,
                                       delay);
                }
+
+               /* We grab our reference early to combat races */
+               ast_req = xchg_acquire(&lock->l_sent_bl_ast, NULL);
+
                ldlm_lock_cancel(lock);
+               /* See if there's an outstanding AST RPC for this lock
+                * and if yes - kill it.
+                */
+               if (ast_req) {
+                       ptlrpc_req_abort(ast_req);
+                       ptlrpc_req_finished(ast_req);
+               }
+
                LDLM_LOCK_PUT(lock);
        }
        if (pres != NULL) {
index a1bf146..75841df 100644 (file)
@@ -3474,6 +3474,20 @@ void ptlrpc_abort_set(struct ptlrpc_request_set *set)
        }
 }
 
+void ptlrpc_req_abort(struct ptlrpc_request *req)
+{
+       spin_lock(&req->rq_lock);
+       req->rq_err = 1;
+       req->rq_status = -ECANCELED;
+
+       if (req->rq_phase != RQ_PHASE_RPC) {
+               spin_unlock(&req->rq_lock);
+               return;
+       }
+       ptlrpc_client_wake_req(req);
+       spin_unlock(&req->rq_lock);
+}
+
 /**
  * Initialize the XID for the node.  This is common among all requests on
  * this node, and only requires the property that it is monotonically
index e00bcb4..1134e61 100644 (file)
@@ -817,8 +817,12 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
        }
        if (request->rq_resend) {
                lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT);
-               if (request->rq_resend_cb != NULL)
-                       request->rq_resend_cb(request, &request->rq_async_args);
+               if (request->rq_resend_cb != NULL) {
+                       rc = request->rq_resend_cb(request,
+                                                  &request->rq_async_args);
+                       if (rc)
+                               return rc;
+               }
        }
        if (request->rq_memalloc)
                mpflag = memalloc_noreclaim_save();