From b92631b0a9029cf64661df8f27e9a50d8a6baf1a Mon Sep 17 00:00:00 2001 From: Vitaly Fertman Date: Tue, 13 Jul 2021 19:07:14 +0300 Subject: [PATCH] LU-14847 ptlrpc: two replay lock threads conflict with each other, which leads to: ASSERTION( atomic_read(&imp->imp_replay_inflight) == 1 ) replay_lock_interpret() calls ptlrpc_connect_import() on error, so one thread starts from the connect reply interpret. replay_lock_interpret() also wakes up ldlm_lock_replay_thread(), which calls ptlrpc_import_recovery_state_machine(). Both threads may then reach ldlm_replay_locks() on the next round at the same time; both increment imp_replay_inflight and the second one asserts. The problem was introduced by LU-13600, which added ldlm_lock_replay_thread() with the ptlrpc_import_recovery_state_machine() call. Lustre-change: https://review.whamcloud.com/44294 Lustre-commit: d7d7eb50c8f5fd3fc5a7808fb112d233bdef34d7 HPE-bug-id: LUS-10147 Fixes: 3b613a442b ("LU-13600 ptlrpc: limit rate of lock replays") Signed-off-by: Vitaly Fertman Signed-off-by: Xing Huang Change-Id: Ia9aafb631e3ba5f850504cc58b4826acec2813bd Reviewed-by: Andreas Dilger Reviewed-by: Mike Pershin Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/55249 Tested-by: jenkins Tested-by: Maloo --- lustre/ldlm/ldlm_request.c | 10 +++++++--- lustre/obdclass/obd_config.c | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index 09ef72b..7cd6cf8 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -2616,7 +2616,8 @@ static int __ldlm_replay_locks(struct obd_import *imp, bool rate_limit) ENTRY; - LASSERT(atomic_read(&imp->imp_replay_inflight) == 1); + while (atomic_read(&imp->imp_replay_inflight) != 1) + cond_resched(); /* don't replay locks if import failed recovery */ if (imp->imp_vbr_failed) @@ -2672,9 +2673,12 @@ int ldlm_replay_locks(struct obd_import *imp) struct task_struct *task; int rc = 0; - 
class_import_get(imp); /* ensure this doesn't fall to 0 before all have been queued */ - atomic_inc(&imp->imp_replay_inflight); + if (atomic_inc_return(&imp->imp_replay_inflight) > 1) { + atomic_dec(&imp->imp_replay_inflight); + return 0; + } + class_import_get(imp); task = kthread_run(ldlm_lock_replay_thread, imp, "ldlm_lock_replay"); if (IS_ERR(task)) { diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index 2fb0c2b..b0c93b9 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -915,8 +915,8 @@ struct obd_device *class_incref(struct obd_device *obd, { lu_ref_add_atomic(&obd->obd_reference, scope, source); atomic_inc(&obd->obd_refcount); - CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd, - atomic_read(&obd->obd_refcount)); + CDEBUG(D_INFO, "incref %s (%p) now %d - %s\n", obd->obd_name, obd, + atomic_read(&obd->obd_refcount), scope); return obd; } -- 1.8.3.1