Whamcloud - gitweb
LU-14847 ptlrpc: two replay lock threads
author Vitaly Fertman <c17818@cray.com>
Tue, 13 Jul 2021 16:07:14 +0000 (19:07 +0300)
committer Andreas Dilger <adilger@whamcloud.com>
Thu, 6 Jun 2024 08:15:32 +0000 (08:15 +0000)
conflict with each other, which leads to:
        ASSERTION( atomic_read(&imp->imp_replay_inflight) == 1 )

replay_lock_interpret() calls ptlrpc_connect_import() on error, so one
thread starts from the connect reply interpret path.

replay_lock_interpret() also wakes up ldlm_lock_replay_thread() which
does ptlrpc_import_recovery_state_machine().

It may happen that both threads reach ldlm_replay_locks() on the next
round at the same time; both increment imp_replay_inflight, and the
second one hits the assertion.

The problem appeared in LU-13600 which added ldlm_lock_replay_thread()
with the ptlrpc_import_recovery_state_machine() call.

Lustre-change: https://review.whamcloud.com/44294
Lustre-commit: d7d7eb50c8f5fd3fc5a7808fb112d233bdef34d7

HPE-bug-id: LUS-10147
Fixes: 3b613a442b ("LU-13600 ptlrpc: limit rate of lock replays")
Signed-off-by: Vitaly Fertman <vitaly.fertman@hpe.com>
Signed-off-by: Xing Huang <hxing@ddn.com>
Change-Id: Ia9aafb631e3ba5f850504cc58b4826acec2813bd
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Mike Pershin <mpershin@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/55249
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/ldlm/ldlm_request.c
lustre/obdclass/obd_config.c

index 09ef72b..7cd6cf8 100644 (file)
@@ -2616,7 +2616,8 @@ static int __ldlm_replay_locks(struct obd_import *imp, bool rate_limit)
 
        ENTRY;
 
-       LASSERT(atomic_read(&imp->imp_replay_inflight) == 1);
+       while (atomic_read(&imp->imp_replay_inflight) != 1)
+               cond_resched();
 
        /* don't replay locks if import failed recovery */
        if (imp->imp_vbr_failed)
@@ -2672,9 +2673,12 @@ int ldlm_replay_locks(struct obd_import *imp)
        struct task_struct *task;
        int rc = 0;
 
-       class_import_get(imp);
        /* ensure this doesn't fall to 0 before all have been queued */
-       atomic_inc(&imp->imp_replay_inflight);
+       if (atomic_inc_return(&imp->imp_replay_inflight) > 1) {
+               atomic_dec(&imp->imp_replay_inflight);
+               return 0;
+       }
+       class_import_get(imp);
 
        task = kthread_run(ldlm_lock_replay_thread, imp, "ldlm_lock_replay");
        if (IS_ERR(task)) {
index 2fb0c2b..b0c93b9 100644 (file)
@@ -915,8 +915,8 @@ struct obd_device *class_incref(struct obd_device *obd,
 {
        lu_ref_add_atomic(&obd->obd_reference, scope, source);
        atomic_inc(&obd->obd_refcount);
-       CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd,
-              atomic_read(&obd->obd_refcount));
+       CDEBUG(D_INFO, "incref %s (%p) now %d - %s\n", obd->obd_name, obd,
+              atomic_read(&obd->obd_refcount), scope);
 
        return obd;
 }