From 4155844f40cb2ba58d794c1091af090d80eca102 Mon Sep 17 00:00:00 2001 From: phil Date: Wed, 3 Dec 2003 10:58:26 +0000 Subject: [PATCH] b=2322 In ldlm_process_{plain,extent}_lock, we used to remove and re-add the lock to the waiting list after every -ERESTART loop. But because of the logic in the ldlm_*_compat_queue functions, in a very rare case, this could lead to lock re-ordering and subsequent deadlock. --- lustre/ChangeLog | 1 + lustre/ldlm/ldlm_extent.c | 10 +++++++--- lustre/ldlm/ldlm_plain.c | 10 +++++++--- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 0126cbe..95d6dd9 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -16,6 +16,7 @@ tbd Cluster File Systems, Inc. - x86-64 compile warning fixes - fix gateway LMC keyword conflict (2318) - fix MDS lock inversions in getattr/reint paths (1844) + - fix a rare lock re-ordering bug, which caused deadlock (2322) * miscellania - allow configurable automake binary, for testing new versions - small update to the lfs documentation diff --git a/lustre/ldlm/ldlm_extent.c b/lustre/ldlm/ldlm_extent.c index 8e62496..246ed17c 100644 --- a/lustre/ldlm/ldlm_extent.c +++ b/lustre/ldlm/ldlm_extent.c @@ -190,9 +190,13 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, int *flags, int first_enq, if (rc != 2) { /* If either of the compat_queue()s returned 0, then we - * have ASTs to send and must go onto the waiting list. */ - ldlm_resource_unlink_lock(lock); - ldlm_resource_add_lock(res, &res->lr_waiting, lock); + * have ASTs to send and must go onto the waiting list. + * + * bug 2322: we used to unlink and re-add here, which was a + * terrible folly -- if we goto restart, we could get + * re-ordered! Causes deadlock, because ASTs aren't sent! */ + if (list_empty(&lock->l_res_link)) + ldlm_resource_add_lock(res, &res->lr_waiting, lock); l_unlock(&res->lr_namespace->ns_lock); rc = ldlm_run_ast_work(res->lr_namespace, &rpc_list); l_lock(&res->lr_namespace->ns_lock); diff --git a/lustre/ldlm/ldlm_plain.c b/lustre/ldlm/ldlm_plain.c index a9d7702..9b2af34 100644 --- a/lustre/ldlm/ldlm_plain.c +++ b/lustre/ldlm/ldlm_plain.c @@ -105,9 +105,13 @@ int ldlm_process_plain_lock(struct ldlm_lock *lock, int *flags, int first_enq, if (rc != 2) { /* If either of the compat_queue()s returned 0, then we - * have ASTs to send and must go onto the waiting list. */ - ldlm_resource_unlink_lock(lock); - ldlm_resource_add_lock(res, &res->lr_waiting, lock); + * have ASTs to send and must go onto the waiting list. + * + * bug 2322: we used to unlink and re-add here, which was a + * terrible folly -- if we goto restart, we could get + * re-ordered! Causes deadlock, because ASTs aren't sent! */ + if (list_empty(&lock->l_res_link)) + ldlm_resource_add_lock(res, &res->lr_waiting, lock); l_unlock(&res->lr_namespace->ns_lock); rc = ldlm_run_ast_work(res->lr_namespace, &rpc_list); l_lock(&res->lr_namespace->ns_lock); -- 1.8.3.1