From d88d6cfc5ae37da2ce4c834543a4afd980e04c4e Mon Sep 17 00:00:00 2001 From: jxiong Date: Thu, 26 Mar 2009 10:20:53 +0000 Subject: [PATCH] b=18829,18850 r=h.huang,yong.fan,shadow Fix some ldlm lock related issue for clio.. Still has more :-) --- lustre/include/obd_support.h | 3 ++- lustre/ldlm/ldlm_lockd.c | 5 +++-- lustre/ldlm/ldlm_request.c | 2 +- lustre/lov/lov_lock.c | 2 +- lustre/osc/osc_lock.c | 20 ++++++++++++++++---- lustre/osc/osc_request.c | 3 +++ lustre/tests/sanity.sh | 11 +++++++++++ 7 files changed, 37 insertions(+), 9 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 75bc6d2..9c70ef4 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -289,6 +289,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_OSC_CKSUM_ADLER_ONLY 0x40c #define OBD_FAIL_OSC_DIO_PAUSE 0x40d #define OBD_FAIL_OSC_OBJECT_CONTENTION 0x40e +#define OBD_FAIL_OSC_CP_CANCEL_RACE 0x40f #define OBD_FAIL_PTLRPC 0x500 #define OBD_FAIL_PTLRPC_ACK 0x501 @@ -716,7 +717,7 @@ do { \ }) #define OBD_SLAB_ALLOC(ptr, slab, type, size) \ do { \ - LASSERT(!in_interrupt()); \ + LASSERT(ergo(type != CFS_ALLOC_ATOMIC, !in_interrupt())); \ (ptr) = cfs_mem_cache_alloc(slab, (type)); \ if (likely((ptr) != NULL && \ (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 || \ diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index ebe2807..f6fe4a7 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -319,6 +319,7 @@ repeat: cont = 0; LDLM_LOCK_GET(lock); + spin_unlock_bh(&waiting_locks_spinlock); LDLM_DEBUG(lock, "prolong the busy lock"); ldlm_refresh_waiting_lock(lock, @@ -326,11 +327,11 @@ repeat: spin_lock_bh(&waiting_locks_spinlock); if (!cont) { - LDLM_LOCK_PUT(lock); + LDLM_LOCK_RELEASE(lock); break; } - LDLM_LOCK_PUT(lock); + LDLM_LOCK_RELEASE(lock); continue; } lock->l_resource->lr_namespace->ns_timeouts++; diff --git a/lustre/ldlm/ldlm_request.c 
b/lustre/ldlm/ldlm_request.c index b6fb1d1..d120548 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -1244,7 +1244,7 @@ int ldlm_cli_cancel(struct lustre_handle *lockh) rc = ldlm_cli_cancel_local(lock); if (rc < 0 || rc == LDLM_FL_LOCAL_ONLY) { - LDLM_LOCK_RELEASE(lock); + LDLM_LOCK_PUT(lock); RETURN(rc < 0 ? rc : 0); } /* Even if the lock is marked as LDLM_FL_BL_AST, this is a LDLM_CANCEL diff --git a/lustre/lov/lov_lock.c b/lustre/lov/lov_lock.c index 9c741f8..67700dd 100644 --- a/lustre/lov/lov_lock.c +++ b/lustre/lov/lov_lock.c @@ -1040,7 +1040,7 @@ static struct cl_lock_closure *lov_closure_get(const struct lu_env *env, struct cl_lock_closure *closure; closure = &lov_env_info(env)->lti_closure; - LINVRNT(list_empty(&closure->clc_list)); + LASSERT(list_empty(&closure->clc_list)); cl_lock_closure_init(env, closure, parent, 1); return closure; } diff --git a/lustre/osc/osc_lock.c b/lustre/osc/osc_lock.c index ef82a95..6fcbcc5 100644 --- a/lustre/osc/osc_lock.c +++ b/lustre/osc/osc_lock.c @@ -786,9 +786,9 @@ static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock, ; else if (dlmlock->l_granted_mode != LCK_MINMODE) osc_lock_granted(env, olck, dlmlock, dlmrc); + unlock_res_and_lock(dlmlock); if (dlmrc != 0) cl_lock_error(env, lock, dlmrc); - unlock_res_and_lock(dlmlock); cl_lock_mutex_put(env, lock); osc_ast_data_put(env, olck); result = 0; @@ -1394,19 +1394,31 @@ static void osc_lock_cancel(const struct lu_env *env, struct cl_lock *lock = slice->cls_lock; struct osc_lock *olck = cl2osc_lock(slice); struct ldlm_lock *dlmlock = olck->ols_lock; - int result; + int result = 0; int discard; LASSERT(cl_lock_is_mutexed(lock)); LINVRNT(osc_lock_invariant(olck)); if (dlmlock != NULL) { + int do_cancel; + discard = dlmlock->l_flags & LDLM_FL_DISCARD_DATA; result = osc_lock_flush(olck, discard); if (olck->ols_hold) osc_lock_unuse(env, slice); - LASSERT(dlmlock->l_readers == 0 && dlmlock->l_writers == 0); - result = 
ldlm_cli_cancel(&olck->ols_handle); + + lock_res_and_lock(dlmlock); + /* Now that we're the only user of the dlm read/write references, + * ->l_readers + ->l_writers should normally be zero. + * However, there is a corner case. + * See bug 18829 for details. */ + do_cancel = (dlmlock->l_readers == 0 && + dlmlock->l_writers == 0); + dlmlock->l_flags |= LDLM_FL_CBPENDING; + unlock_res_and_lock(dlmlock); + if (do_cancel) + result = ldlm_cli_cancel(&olck->ols_handle); if (result < 0) CL_LOCK_DEBUG(D_ERROR, env, lock, "lock %p cancel failure with error(%d)\n", diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 6c9fcab..9e34a2c 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -3064,6 +3064,9 @@ static int osc_enqueue_interpret(const struct lu_env *env, /* Complete osc stuff. */ rc = osc_enqueue_fini(req, aa->oa_lvb, aa->oa_upcall, aa->oa_cookie, aa->oa_flags, rc); + + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10); + /* Release the lock for async request. */ if (lustre_handle_is_used(&handle) && rc == ELDLM_OK) /* diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index d5611db..a32da55 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -6372,6 +6372,17 @@ test_212() { } run_test 212 "Sendfile test ============================================" +test_213() { + dd if=/dev/zero of=$DIR/$tfile bs=4k count=4 + cancel_lru_locks osc + lctl set_param fail_loc=0x8000040f + # generate a read lock + cat $DIR/$tfile > /dev/null + # write to the file, it will try to cancel the above read lock. + cat /etc/hosts >> $DIR/$tfile +} +run_test 213 "OSC lock completion and cancel race don't crash - bug 18829" + # # tests that do cleanup/setup should be run at the end # -- 1.8.3.1