Whamcloud - gitweb
LU-17446 ldlm: Do not wait for BL AST RPC completion on cancel 39/53739/12
author Oleg Drokin <green@whamcloud.com>
Fri, 19 Jan 2024 05:24:43 +0000 (00:24 -0500)
committer Andreas Dilger <adilger@whamcloud.com>
Sat, 27 Jan 2024 18:24:25 +0000 (18:24 +0000)
If we have sent an AST RPC to the client and while it's in flight
the client sent in the cancel, sometimes (esp. if AST or reply
to it are lost) even though the lock is already cancelled, whoever
is waiting on it is still stuck while trying to resend ASTs.
And in the end the client is not even evicted because the lock cancel
did come and all is fine, but it can add over a hundred seconds
to lock granting process in some non-ideal circumstances.

For simplicity we only treat Blocking ASTs like this, since we
can only have a single one of this kind.

This adds an additional pointer to struct ldlm_lock, but that struct
is already 560 bytes, so the addition does not really mean much.

Change-Id: Id2231bc3bfc3e094faae2872fe09f3c330d441df
Signed-off-by: Oleg Drokin <green@whamcloud.com>
lustre/include/lustre_dlm.h
lustre/include/lustre_net.h
lustre/ldlm/ldlm_lock.c
lustre/ldlm/ldlm_lockd.c
lustre/ptlrpc/client.c
lustre/ptlrpc/niobuf.c

index 82e7a55..0d86774 100644 (file)
@@ -1021,6 +1021,13 @@ struct ldlm_lock {
         * is: res lock -> exp_bl_list_lock -> wanting_lists_spinlock.
         */
        struct list_head        l_exp_list;
+
+       /**
+        * Used internally to trace in-flight blocking ASTs of this lock
+        * There's no locking, so to pick items from here one has to use atomic
+        * exchange primitives
+        */
+       struct ptlrpc_request   *l_sent_bl_ast;
 };
 
 enum ldlm_match_flags {
index 4b12182..bff5b7c 100644 (file)
@@ -725,8 +725,7 @@ typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env,
                                     struct ptlrpc_request *req,
                                     void *arg, int rc);
 /** Type of request resend call-back */
-typedef void (*ptlrpc_resend_cb_t)(struct ptlrpc_request *req,
-                                  void *arg);
+typedef int (*ptlrpc_resend_cb_t)(struct ptlrpc_request *req, void *arg);
 
 /**
  * Definition of request pool structure.
@@ -2094,6 +2093,7 @@ void ptlrpc_restart_req(struct ptlrpc_request *req);
 void ptlrpc_abort_inflight(struct obd_import *imp);
 void ptlrpc_cleanup_imp(struct obd_import *imp);
 void ptlrpc_abort_set(struct ptlrpc_request_set *set);
+void ptlrpc_req_abort(struct ptlrpc_request *req);
 
 struct ptlrpc_request_set *ptlrpc_prep_set(void);
 struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
index 849e039..b86402e 100644 (file)
@@ -2832,7 +2832,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock,
 
         if (resource == NULL) {
                libcfs_debug_msg(msgdata,
-                                "%pV ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n",
+                                "%pV ns: \?\? lock: %pK/%#llx lrc: %d/%d,%d mode: %s/%s res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d/%pK\n",
                                 &vaf,
                                 lock,
                                 lock->l_handle.h_cookie,
@@ -2844,7 +2844,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock,
                                 lock->l_remote_handle.cookie,
                                 exp ? refcount_read(&exp->exp_handle.h_ref) : -99,
                                 lock->l_pid, lock->l_callback_timestamp,
-                                lock->l_lvb_type);
+                                lock->l_lvb_type, lock->l_sent_bl_ast);
                 va_end(args);
                 return;
         }
@@ -2852,7 +2852,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock,
        switch (resource->lr_type) {
        case LDLM_EXTENT:
                libcfs_debug_msg(msgdata,
-                                "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s [%llu->%llu] (req %llu->%llu) gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n",
+                                "%pV ns: %s lock: %pK/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s [%llu->%llu] (req %llu->%llu) gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d/%pK\n",
                                 &vaf,
                                 ldlm_lock_to_ns_name(lock), lock,
                                 lock->l_handle.h_cookie,
@@ -2871,12 +2871,12 @@ void _ldlm_lock_debug(struct ldlm_lock *lock,
                                 lock->l_remote_handle.cookie,
                                 exp ? refcount_read(&exp->exp_handle.h_ref) : -99,
                                 lock->l_pid, lock->l_callback_timestamp,
-                                lock->l_lvb_type);
+                                lock->l_lvb_type, lock->l_sent_bl_ast);
                break;
 
        case LDLM_FLOCK:
                libcfs_debug_msg(msgdata,
-                                "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s pid: %d [%llu->%llu] flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld\n",
+                                "%pV ns: %s lock: %pK/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s pid: %d [%llu->%llu] flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld\n",
                                 &vaf,
                                 ldlm_lock_to_ns_name(lock), lock,
                                 lock->l_handle.h_cookie,
@@ -2899,7 +2899,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock,
        case LDLM_IBITS:
                if (!lock->l_remote_handle.cookie)
                        libcfs_debug_msg(msgdata,
-                                "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx/%#llx rrc: %d type: %s flags: %#llx pid: %u initiator: MDT%d\n",
+                                "%pV ns: %s lock: %pK/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx/%#llx rrc: %d type: %s flags: %#llx pid: %u initiator: MDT%04x, lvb_type: %d/%pK\n",
                                 &vaf,
                                 ldlm_lock_to_ns_name(lock),
                                 lock, lock->l_handle.h_cookie,
@@ -2913,10 +2913,11 @@ void _ldlm_lock_debug(struct ldlm_lock *lock,
                                 atomic_read(&resource->lr_refcount),
                                 ldlm_typename[resource->lr_type],
                                 lock->l_flags, lock->l_pid,
-                                lock->l_policy_data.l_inodebits.li_initiator_id);
+                                lock->l_policy_data.l_inodebits.li_initiator_id,
+                                lock->l_lvb_type, lock->l_sent_bl_ast);
                else
                        libcfs_debug_msg(msgdata,
-                                "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx/%#llx rrc: %d type: %s gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n",
+                                "%pV ns: %s lock: %pK/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx/%#llx rrc: %d type: %s gid %llu flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d/%pK\n",
                                 &vaf,
                                 ldlm_lock_to_ns_name(lock),
                                 lock, lock->l_handle.h_cookie,
@@ -2934,12 +2935,12 @@ void _ldlm_lock_debug(struct ldlm_lock *lock,
                                 lock->l_remote_handle.cookie,
                                 exp ? refcount_read(&exp->exp_handle.h_ref) : -99,
                                 lock->l_pid, lock->l_callback_timestamp,
-                                lock->l_lvb_type);
+                                lock->l_lvb_type, lock->l_sent_bl_ast);
                break;
 
        default:
                libcfs_debug_msg(msgdata,
-                                "%pV ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d\n",
+                                "%pV ns: %s lock: %pK/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lld lvb_type: %d/%pK\n",
                                 &vaf,
                                 ldlm_lock_to_ns_name(lock),
                                 lock, lock->l_handle.h_cookie,
@@ -2954,7 +2955,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock,
                                 lock->l_remote_handle.cookie,
                                 exp ? refcount_read(&exp->exp_handle.h_ref) : -99,
                                 lock->l_pid, lock->l_callback_timestamp,
-                                lock->l_lvb_type);
+                                lock->l_lvb_type, lock->l_sent_bl_ast);
                break;
        }
        va_end(args);
index 8651412..0fd0300 100644 (file)
@@ -741,6 +741,10 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock,
 {
        struct lnet_processid *peer = &req->rq_import->imp_connection->c_peer;
 
+       /* This AST was cancelled by lock cancel/convert */
+       if (rc == -ECANCELED)
+               return rc;
+
        if (!req->rq_replied || (rc && rc != -EINVAL)) {
                if (ldlm_is_cancel(lock)) {
                        LDLM_DEBUG(lock,
@@ -843,6 +847,36 @@ static int ldlm_cb_interpret(const struct lu_env *env,
                }
                break;
        case LDLM_BL_CALLBACK:
+               /* if nobody grabbed our linked request, decref to match
+                * acquisition.
+                * Three cases are possible here:
+                * - l_sent_bl_ast is equal to our request - we'll free it
+                * - it's equal to NULL - so cancel / convert thread grabbed it
+                * - it's not NULL, but not equal to req - this is a convert
+                *   race, where lock convert came, and by the time we got here,
+                *   the lock got another BL AST that got recorded here.
+                *
+                * This 3rd case is the most annoying handling wise. Once we
+                * acquired this request with atomic exchange there's no safe
+                * way to put it back without risking the other thread missing
+                * it so we have to release it here anyway and bring the
+                * original problem back - if the other AST won't be replied
+                * in time, the other blocked thread would wait for a long time.
+                *
+                * To minimize this window, we'll do an unsafe check first,
+                * it's ok to check sent_bl_ast for both NULL and non-req value
+                * because once those values were seen, they cannot magically
+                * return to the req value. Either way whatever we got we must
+                * free if not NULL.
+                */
+               if (lock->l_sent_bl_ast != NULL && lock->l_sent_bl_ast == req) {
+                       struct ptlrpc_request *saved_req;
+
+                       saved_req = xchg_acquire(&lock->l_sent_bl_ast, NULL);
+                       if (saved_req)
+                               ptlrpc_req_finished(saved_req);
+               }
+
                if (rc != 0)
                        rc = ldlm_handle_ast_error(lock, req, rc, "blocking");
                break;
@@ -865,12 +899,23 @@ static int ldlm_cb_interpret(const struct lu_env *env,
        RETURN(0);
 }
 
-static void ldlm_update_resend(struct ptlrpc_request *req, void *data)
+static int ldlm_update_resend(struct ptlrpc_request *req, void *data)
 {
        struct ldlm_cb_async_args *ca = data;
        struct ldlm_lock *lock = ca->ca_lock;
 
+       /* See if the lock was already canceled, if so, don't do any
+        * resends, I am afraid to use lock_is_granted because what if
+        * we've got a lock convert? Or is it also susceptible to this kind
+        * of problems?
+        */
+       if (lock->l_granted_mode == LCK_NL) {
+               req->rq_delay_limit = 1;
+               return -ESTALE;
+       }
+
        ldlm_refresh_waiting_lock(lock, ldlm_bl_timeout(lock));
+       return 0;
 }
 
 static inline int ldlm_ast_fini(struct ptlrpc_request *req,
@@ -1028,6 +1073,21 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock,
                /* Do not resend after lock callback timeout */
                req->rq_delay_limit = ldlm_bl_timeout(lock);
                req->rq_resend_cb = ldlm_update_resend;
+
+               /* We remember the request here so we can kill the request
+                * if the lock cancel/conversion comes before AST reply.
+                * For simplicity we only do this for blocking AST because
+                * there could be only one, so we don't need to do
+                * the whole list + locking mess.
+                * It does not help that there's no lock-specific spinlocks
+                * in struct ldlm_lock.
+                * To safely use this stored RPC caller pretty much has to
+                * use atomic exchange with NULL to avoid situation of two
+                * threads working on the same RPC.
+                */
+               LASSERT(!lock->l_sent_bl_ast);
+               ptlrpc_request_addref(req);
+               lock->l_sent_bl_ast = req;
        }
 
        req->rq_send_state = LUSTRE_IMP_FULL;
@@ -1669,6 +1729,7 @@ int ldlm_handle_convert0(struct ptlrpc_request *req,
        struct obd_export *exp = req->rq_export;
        struct ldlm_reply *dlm_rep;
        struct ldlm_lock *lock;
+       struct ptlrpc_request *ast_req;
        __u64 bits;
        __u64 new_bits;
        int rc;
@@ -1711,6 +1772,8 @@ int ldlm_handle_convert0(struct ptlrpc_request *req,
                GOTO(out_put, rc = -EPROTO);
        }
 
+       /* Grab AST RPC for later abort */
+       ast_req = xchg_acquire(&lock->l_sent_bl_ast, NULL);
        if (bits == new_bits) {
                /*
                 * This can be valid situation if CONVERT RPCs are
@@ -1735,6 +1798,14 @@ int ldlm_handle_convert0(struct ptlrpc_request *req,
                ldlm_reprocess_all(lock->l_resource, bits);
        }
 
+       /* See if there's an outstanding AST RPC for this lock
+        * and if yes - kill it.
+        */
+       if (ast_req) {
+               ptlrpc_req_abort(ast_req);
+               ptlrpc_req_finished(ast_req);
+       }
+
        dlm_rep->lock_handle = lock->l_remote_handle;
        ldlm_ibits_policy_local_to_wire(&lock->l_policy_data,
                                        &dlm_rep->lock_desc.l_policy_data);
@@ -1761,6 +1832,7 @@ int ldlm_request_cancel(struct ptlrpc_request *req,
        struct ldlm_lock *lock;
        int i, count, done = 0;
        unsigned int size;
+       struct ptlrpc_request *ast_req;
 
        ENTRY;
 
@@ -1835,7 +1907,19 @@ int ldlm_request_cancel(struct ptlrpc_request *req,
                                       &lock->l_export->exp_bl_lock_at,
                                       delay);
                }
+
+               /* We grab our reference early to combat races */
+               ast_req = xchg_acquire(&lock->l_sent_bl_ast, NULL);
+
                ldlm_lock_cancel(lock);
+               /* See if there's an outstanding AST RPC for this lock
+                * and if yes - kill it.
+                */
+               if (ast_req) {
+                       ptlrpc_req_abort(ast_req);
+                       ptlrpc_req_finished(ast_req);
+               }
+
                LDLM_LOCK_PUT(lock);
        }
        if (pres != NULL) {
index a1bf146..75841df 100644 (file)
@@ -3474,6 +3474,20 @@ void ptlrpc_abort_set(struct ptlrpc_request_set *set)
        }
 }
 
+void ptlrpc_req_abort(struct ptlrpc_request *req)
+{
+       spin_lock(&req->rq_lock);
+       req->rq_err = 1;
+       req->rq_status = -ECANCELED;
+
+       if (req->rq_phase != RQ_PHASE_RPC) {
+               spin_unlock(&req->rq_lock);
+               return;
+       }
+       ptlrpc_client_wake_req(req);
+       spin_unlock(&req->rq_lock);
+}
+
 /**
  * Initialize the XID for the node.  This is common among all requests on
  * this node, and only requires the property that it is monotonically
index e00bcb4..1134e61 100644 (file)
@@ -817,8 +817,12 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply)
        }
        if (request->rq_resend) {
                lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT);
-               if (request->rq_resend_cb != NULL)
-                       request->rq_resend_cb(request, &request->rq_async_args);
+               if (request->rq_resend_cb != NULL) {
+                       rc = request->rq_resend_cb(request,
+                                                  &request->rq_async_args);
+                       if (rc)
+                               return rc;
+               }
        }
        if (request->rq_memalloc)
                mpflag = memalloc_noreclaim_save();