From 09ac97533f32edd51e879de9b96070a137cf5d26 Mon Sep 17 00:00:00 2001 From: adilger Date: Sun, 21 Mar 2004 06:15:51 +0000 Subject: [PATCH] Don't dereference freed lock when lock cancellation is racing with export eviction. Create a constant for freed memory to make it easier to check. Test 45 in replay-single is disabled because with or without the patch the forced client cleanup isn't working properly. b=2867 r=phil,robert --- lnet/include/linux/kp30.h | 3 +++ lustre/ChangeLog | 1 + lustre/include/linux/lustre_lib.h | 2 ++ lustre/ldlm/ldlm_lockd.c | 42 +++++++++++++++++-------------------- lustre/portals/include/linux/kp30.h | 3 +++ lustre/ptlrpc/client.c | 4 ++-- lustre/ptlrpc/events.c | 2 +- lustre/tests/recovery-small.sh | 25 ++++++++++++++++++++++ lustre/tests/replay-single.sh | 20 +++++++++++++++++- 9 files changed, 75 insertions(+), 27 deletions(-) diff --git a/lnet/include/linux/kp30.h b/lnet/include/linux/kp30.h index c080a57..4c3e85f 100644 --- a/lnet/include/linux/kp30.h +++ b/lnet/include/linux/kp30.h @@ -1115,18 +1115,21 @@ void kportal_put_ni (int nal); # define LPX64 "%#Lx" # define LPSZ "%lu" # define LPSSZ "%ld" +# define LP_POISON ((void *)0x5a5a5a5a5a5a5a5a) #elif (BITS_PER_LONG == 32 || __WORDSIZE == 32) # define LPU64 "%Lu" # define LPD64 "%Ld" # define LPX64 "%#Lx" # define LPSZ "%u" # define LPSSZ "%d" +# define LP_POISON ((void *)0x5a5a5a5a) #elif (BITS_PER_LONG == 64 || __WORDSIZE == 64) # define LPU64 "%lu" # define LPD64 "%ld" # define LPX64 "%#lx" # define LPSZ "%lu" # define LPSSZ "%ld" +# define LP_POISON ((void *)0x5a5a5a5a5a5a5a5a) #endif #ifndef LPU64 # error "No word size defined" diff --git a/lustre/ChangeLog b/lustre/ChangeLog index f2477a7..9a2492e 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -20,6 +20,7 @@ tbd Cluster File Systems, Inc. - don't evict page beyond end of stripe extent (2925) - don't oops on a deleted current working directory (2399) - handle hard links to targets without a parent properly (2517) + - don't dereference NULL lock when racing during eviction (2867) 2004-03-04 Cluster File Systems, Inc. * version 1.2.0 diff --git a/lustre/include/linux/lustre_lib.h b/lustre/include/linux/lustre_lib.h index 24ad8fb..b0b907c 100644 --- a/lustre/include/linux/lustre_lib.h +++ b/lustre/include/linux/lustre_lib.h @@ -48,10 +48,12 @@ #define LPU64 "%lu" #define LPD64 "%ld" #define LPX64 "%#lx" +#define LP_POISON ((void *)0x5a5a5a5a5a5a5a5a) #else #define LPU64 "%Lu" #define LPD64 "%Ld" #define LPX64 "%#Lx" +#define LP_POISON ((void *)0x5a5a5a5a) #endif #endif diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 6602713..c88139f 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -126,7 +126,6 @@ static int expired_lock_main(void *arg) wake_up(&expired_lock_thread.elt_waitq); while (1) { - struct list_head *tmp, *n, work_list; l_wait_event(expired_lock_thread.elt_waitq, have_expired_locks() || expired_lock_thread.elt_state == ELT_TERMINATE, @@ -134,33 +133,30 @@ static int expired_lock_main(void *arg) spin_lock_bh(&expired_lock_thread.elt_lock); while (!list_empty(expired)) { + struct obd_export *export; struct ldlm_lock *lock; - list_add(&work_list, expired); - list_del_init(expired); - - list_for_each_entry(lock, &work_list, l_pending_chain) { - LDLM_DEBUG(lock, "moving to work list"); - } - - spin_unlock_bh(&expired_lock_thread.elt_lock); - - - list_for_each_safe(tmp, n, &work_list) { - lock = list_entry(tmp, struct ldlm_lock, - l_pending_chain); - ptlrpc_fail_export(lock->l_export); + lock = list_entry(expired->next, struct ldlm_lock, + l_pending_chain); + if ((void *)lock < LP_POISON + PAGE_SIZE && + (void *)lock >= LP_POISON) { + CERROR("free lock on elt list %p\n", lock); + LBUG(); } - - - if (!list_empty(&work_list)) { - list_for_each_entry(lock, &work_list, l_pending_chain) { - LDLM_ERROR(lock, "still on work list!"); - } + list_del_init(&lock->l_pending_chain); + if ((void *)lock->l_export < LP_POISON + PAGE_SIZE && + (void *)lock->l_export >= LP_POISON + PAGE_SIZE) { + CERROR("lock with free export on elt list %p\n", + export); + lock->l_export = NULL; + LDLM_ERROR(lock, "free export\n"); + continue; } - LASSERTF (list_empty(&work_list), - "some exports not failed properly\n"); + export = class_export_get(lock->l_export); + spin_unlock_bh(&expired_lock_thread.elt_lock); + ptlrpc_fail_export(export); + class_export_put(export); spin_lock_bh(&expired_lock_thread.elt_lock); } spin_unlock_bh(&expired_lock_thread.elt_lock); diff --git a/lustre/portals/include/linux/kp30.h b/lustre/portals/include/linux/kp30.h index c080a57..4c3e85f 100644 --- a/lustre/portals/include/linux/kp30.h +++ b/lustre/portals/include/linux/kp30.h @@ -1115,18 +1115,21 @@ void kportal_put_ni (int nal); # define LPX64 "%#Lx" # define LPSZ "%lu" # define LPSSZ "%ld" +# define LP_POISON ((void *)0x5a5a5a5a5a5a5a5a) #elif (BITS_PER_LONG == 32 || __WORDSIZE == 32) # define LPU64 "%Lu" # define LPD64 "%Ld" # define LPX64 "%#Lx" # define LPSZ "%u" # define LPSSZ "%d" +# define LP_POISON ((void *)0x5a5a5a5a) #elif (BITS_PER_LONG == 64 || __WORDSIZE == 64) # define LPU64 "%lu" # define LPD64 "%ld" # define LPX64 "%#lx" # define LPSZ "%lu" # define LPSSZ "%ld" +# define LP_POISON ((void *)0x5a5a5a5a5a5a5a5a) #endif #ifndef LPU64 # error "No word size defined" diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 8dc4ad0..70db906 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1070,8 +1070,8 @@ static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked) if (request == NULL) RETURN(1); - if (request == (void *)(unsigned long)(0x5a5a5a5a5a5a5a5a) || - request->rq_reqmsg == (void *)(unsigned long)(0x5a5a5a5a5a5a5a5a)) { + if (request == LP_POISON || + request->rq_reqmsg == LP_POISON) { CERROR("dereferencing freed request (bug 575)\n"); LBUG(); RETURN(1); diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index b1f8221..343ccba 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -324,7 +324,7 @@ static int ptlrpc_master_callback(ptl_event_t *ev) void (*callback)(ptl_event_t *ev) = cbid->cbid_fn; /* Honestly, it's best to find out early. */ - LASSERT (cbid->cbid_arg != (void *)0x5a5a5a5a5a5a5a5a); + LASSERT (cbid->cbid_arg != LP_POISON); LASSERT (callback == request_out_callback || callback == reply_in_callback || callback == client_bulk_callback || diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 5cd1e62..4b2c37f 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -199,4 +199,29 @@ test_15() { } run_test 15 "failed open (-ENOMEM)" +test_19a() { + f=$MOUNT/$tfile + do_facet client mcreate $f || return 1 + drop_ldlm_cancel "chmod 0777 $f" || echo evicted + + do_facet client checkstat -v -p 0777 $f || echo evicted + do_facet client "munlink $f" +} +run_test 19a "test expired_lock_main on mds (2867)" + +test_19b() { + f=$MOUNT/$tfile + do_facet client multiop $f Ow || return 1 + do_facet client multiop $f or || return 2 + + cancel_lru_locks OSC + + do_facet client multiop $f or || return 3 + drop_ldlm_cancel multiop $f Ow || echo "client evicted, as expected" + + do_facet client munlink $f || return 4 + +} +run_test 19b "test expired_lock_main on ost (2867)" + $CLEANUP diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 8b1c6e3..13a3465 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -14,7 +14,7 @@ init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/local.sh} # Skip these tests -ALWAYS_EXCEPT="" +ALWAYS_EXCEPT="45" gen_config() { @@ -864,6 +864,24 @@ test_44() { } run_test 44 "race in target handle connect" +test_45() { + d=$MOUNT/$tdir + + mkdir $d + ls -l $d + cp /etc/termcap $d + cat $d/termcap > /dev/null + + zconf_umount client $MOUNT -f + zconf_mount `hostname` $MOUNT + + rm -rf $d || return 1 + $CHECKSTAT -t dir $d && return 1 || true + + sleep 10 +} +run_test 45 "test client eviction after client crash" + equals_msg test complete, cleaning up $CLEANUP -- 1.8.3.1