From 054a7ff8655c9785749304586fbd9b8c7a8d5ba4 Mon Sep 17 00:00:00 2001 From: phil Date: Wed, 2 Mar 2005 23:16:57 +0000 Subject: [PATCH] b=5779 There's one error path in the DLM's enqueue code where, after a timeout and user abort, the client will keep its local copy of the lock. This has a few follow-on effects, all of which should be fixed by this patch: - the reference on the lock is never dropped, so we never try to cancel it, so we never find out that our view of the lock state differs from the server's. This could perhaps cause some corruption. - we try to match this lock on future enqueues; although the lock is marked as failed, search_queue is only checking for destroyed (bug). I don't know precisely why we need two flags for this, but that's a more subtle change than I'm willing to make right now. - once we have a handle on that lock, the completion AST does check that flag, so it returns an error right away -- but we don't check its return code in the match path (bug) and plow on - the lock enqueue was originally aborted before it got to the part that updates the KMS and sets the LDLM_FL_CAN_MATCH flag. So each match attempt will wait 100 seconds for that flag to get set, which of course never happens. We should print a pretty serious warning if that timeout happens, but fixes for the previous two bugs should prevent us from getting here in the first place. This has been running at NERSC for the last week, so I think it's ready for more exposure. --- lustre/ldlm/ldlm_lock.c | 16 +++++++++++----- lustre/ldlm/ldlm_request.c | 4 +--- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index 9f18891..be4bf58 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -588,7 +588,7 @@ static struct ldlm_lock *search_queue(struct list_head *queue, ldlm_mode_t mode, lock->l_policy_data.l_extent.end < policy->l_extent.end)) continue; - if (lock->l_destroyed) + if (lock->l_destroyed || (lock->l_flags & LDLM_FL_FAILED)) continue; if ((flags & LDLM_FL_LOCAL_ONLY) && @@ -682,10 +682,15 @@ int ldlm_lock_match(struct ldlm_namespace *ns, int flags, ldlm_lock2handle(lock, lockh); if (!(lock->l_flags & LDLM_FL_CAN_MATCH)) { struct l_wait_info lwi; - if (lock->l_completion_ast) - lock->l_completion_ast(lock, - LDLM_FL_WAIT_NOREPROC, - NULL); + if (lock->l_completion_ast) { + int err = lock->l_completion_ast(lock, + LDLM_FL_WAIT_NOREPROC, + NULL); + if (err) { + rc = 0; + goto out2; + } + } lwi = LWI_TIMEOUT_INTR(obd_timeout*HZ, NULL,NULL,NULL); @@ -694,6 +699,7 @@ int ldlm_lock_match(struct ldlm_namespace *ns, int flags, (lock->l_flags & LDLM_FL_CAN_MATCH), &lwi); } } + out2: if (rc) { l_lock(&ns->ns_lock); LDLM_DEBUG(lock, "matched ("LPU64" "LPU64")", diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index eec23ba..afa8933 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -401,10 +401,8 @@ int ldlm_cli_enqueue(struct obd_export *exp, int err = lock->l_completion_ast(lock, *flags, NULL); if (!rc) rc = err; - if (lock->l_destroyed || - lock->l_flags & LDLM_FL_FAILED) + if (rc) cleanup_phase = 2; - } } -- 1.8.3.1