From c620f4d7d0c43e1ca3dc98b3d9a041bc13ac71d2 Mon Sep 17 00:00:00 2001
From: adilger
Date: Thu, 28 Apr 2005 00:03:30 +0000
Subject: [PATCH] Branch: b1_4

Don't leave lock on resource list in ldlm_handle_enqueue() error path,
as that will LBUG when lock is destroyed.  Regression test for same.
b=6149, b=6184
r=phil (original patch)
---
 lustre/ChangeLog                   |  1 +
 lustre/include/linux/obd_support.h |  1 +
 lustre/ldlm/ldlm_lockd.c           | 26 +++++++++++++++++++-------
 lustre/tests/sanity.sh             | 20 ++++++++++++++++----
 4 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 699612a..043ab0a 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -25,6 +25,7 @@ tbd Cluster File Systems, Inc.
           mountpoints (5907)
        - Avoid lock ordering deadlock issue with write/truncate (6203,5654)
        - reserve enough journal credits in fsfilt_start_log for setattr (4554)
+       - ldlm_enqueue freed-export error path would always LBUG (6149,6184)
        * miscellania
        - by default create 1 inode per 4kB space on MDS, per 16kB on OSTs
        - allow --write-conf on an MDS with different nettype than client (5619)
diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h
index c45aa38..c7604da 100644
--- a/lustre/include/linux/obd_support.h
+++ b/lustre/include/linux/obd_support.h
@@ -133,6 +133,7 @@ extern wait_queue_head_t obd_race_waitq;
 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED    0x30b
 #define OBD_FAIL_LDLM_REPLY              0x30c
 #define OBD_FAIL_LDLM_RECOV_CLIENTS      0x30d
+#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e
 
 #define OBD_FAIL_OSC                     0x400
 #define OBD_FAIL_OSC_BRW_READ_BULK       0x401
diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c
index 450e2cf..953c823 100644
--- a/lustre/ldlm/ldlm_lockd.c
+++ b/lustre/ldlm/ldlm_lockd.c
@@ -135,7 +135,8 @@ static int expired_lock_main(void *arg)
 
                         /* from waiting_locks_callback, but not in timer */
                         portals_debug_dumplog();
-                        portals_run_lbug_upcall(__FILE__, "waiting_locks_cb",
+                        portals_run_lbug_upcall(__FILE__,
+                                                "waiting_locks_callback",
                                                 expired_lock_thread.elt_dump);
 
                         spin_lock_bh(&waiting_locks_spinlock);
@@ -150,8 +151,8 @@ static int expired_lock_main(void *arg)
                                           l_pending_chain);
                         if ((void *)lock < LP_POISON + PAGE_SIZE &&
                             (void *)lock >= LP_POISON) {
-                                CERROR("free lock on elt list %p\n", lock);
                                 spin_unlock_bh(&waiting_locks_spinlock);
+                                CERROR("free lock on elt list %p\n", lock);
                                 LBUG();
                         }
                         list_del_init(&lock->l_pending_chain);
@@ -187,6 +188,9 @@ static void waiting_locks_callback(unsigned long unused)
         struct ldlm_lock *lock, *last = NULL;
         char str[PTL_NALFMT_SIZE];
 
+        if (obd_dump_on_timeout)
+                portals_debug_dumplog();
+
         spin_lock_bh(&waiting_locks_spinlock);
         while (!list_empty(&waiting_locks_list)) {
                 lock = list_entry(waiting_locks_list.next, struct ldlm_lock,
@@ -366,6 +370,8 @@ static void ldlm_failed_ast(struct ldlm_lock *lock, int rc,
                    " (%s)", ast_type, rc, lock->l_export->exp_client_uuid.uuid,
                    conn->c_remote_uuid.uuid, conn->c_peer.peer_id.nid, str);
 
+        if (obd_dump_on_timeout)
+                portals_debug_dumplog();
         ptlrpc_fail_export(lock->l_export);
 }
 
@@ -391,6 +397,7 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock,
                         ldlm_failed_ast(lock, rc, ast_type);
                 }
         } else if (rc) {
+                l_lock(&lock->l_resource->lr_namespace->ns_lock);
                 if (rc == -EINVAL)
                         LDLM_DEBUG(lock, "client (nid %s) returned %d"
                                    " from %s AST - normal race",
@@ -402,6 +409,7 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock,
                                    (req->rq_repmsg != NULL) ?
                                    req->rq_repmsg->status : 0, ast_type);
                 ldlm_lock_cancel(lock);
+                l_unlock(&lock->l_resource->lr_namespace->ns_lock);
                 /* Server-side AST functions are called from ldlm_reprocess_all,
                  * which needs to be told to please restart its reprocessing. */
                 rc = -ERESTART;
@@ -499,10 +507,11 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data)
 
         LASSERT(lock != NULL);
         do_gettimeofday(&granted_time);
-        total_enqueue_wait = timeval_sub(&granted_time, &lock->l_enqueued_time);
+        total_enqueue_wait = timeval_sub(&granted_time,&lock->l_enqueued_time);
 
         if (total_enqueue_wait / 1000000 > obd_timeout)
-                LDLM_ERROR(lock, "enqueue wait took %luus", total_enqueue_wait);
+                LDLM_ERROR(lock, "enqueue wait took %luus from %lu",
+                           total_enqueue_wait, lock->l_enqueued_time.tv_sec);
 
         down(&lock->l_resource->lr_lvb_sem);
         if (lock->l_resource->lr_lvb_len) {
@@ -647,7 +656,7 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req,
         LASSERT(req->rq_export);
 
         if (flags & LDLM_FL_REPLAY) {
-                lock = find_existing_lock(req->rq_export, 
+                lock = find_existing_lock(req->rq_export,
                                           &dlm_req->lock_handle1);
                 if (lock != NULL) {
                         DEBUG_REQ(D_HA, req, "found existing lock cookie "LPX64,
@@ -746,7 +755,8 @@ existing_lock:
         l_lock(&lock->l_resource->lr_namespace->ns_lock);
         /* Don't move a pending lock onto the export if it has already
          * been evicted. Cancel it now instead. (bug 5683) */
-        if (req->rq_export->exp_failed) {
+        if (req->rq_export->exp_failed ||
+            OBD_FAIL_CHECK_ONCE(OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT)) {
                 LDLM_ERROR(lock, "lock on destroyed export %p", req->rq_export);
                 rc = -ENOTCONN;
         } else if (lock->l_flags & LDLM_FL_AST_SENT) {
@@ -788,6 +798,7 @@ existing_lock:
                 }
                 up(&lock->l_resource->lr_lvb_sem);
         } else {
+                ldlm_resource_unlink_lock(lock);
                 ldlm_lock_destroy(lock);
         }
 
@@ -839,7 +850,8 @@ int ldlm_handle_convert(struct ptlrpc_request *req)
         }
 
         if (lock) {
-                ldlm_reprocess_all(lock->l_resource);
+                if (!req->rq_status)
+                        ldlm_reprocess_all(lock->l_resource);
                 l_lock(&lock->l_resource->lr_namespace->ns_lock);
                 LDLM_DEBUG(lock, "server-side convert handler END");
                 l_unlock(&lock->l_resource->lr_namespace->ns_lock);
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 915860e..056045b 100644
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -1998,8 +1998,7 @@ test_56() {
           "lfs find --recursive $DIR/d56 wrong: found $FILENUM, expected $NUMFILESx2"
         FILENUM=`$LFIND $DIR/d56 | grep -c obdidx`
         [ $FILENUM -eq $NUMFILES ] || error \
-                "lfs find $DIR/d56 without --recursive wrong: found $FILENUM,
-                 expected $NUMFILES"
+                "lfs find $DIR/d56 without --recursive wrong: found $FILENUM, expected $NUMFILES"
         echo "lfs find --recursive passed."
 
         # test lfs find with file instead of dir
@@ -2017,7 +2016,7 @@ test_56() {
 
         #test lfs find with --obd
         $LFIND --obd wrong_uuid $DIR/d56 2>&1 | grep -q "unknown obduuid" || \
-                error "lfs find --obd wrong_uuid should return error information"
+                error "lfs find --obd wrong_uuid should return error message"
 
         [ "$OSTCOUNT" -lt 2 ] && \
                 echo "skipping other lfs find --obd test" && return
@@ -2026,7 +2025,7 @@ test_56() {
         FOUND=`$LFIND -r --obd $OBDUUID $DIR/d56 | wc -l`
         [ $FOUND -eq $FILENUM ] || \
                 error "lfs find --obd wrong: found $FOUND, expected $FILENUM"
-        [ `$LFIND -r -v --obd $OBDUUID $DIR/d56 | sed '/^[ ]*1[ ]/d' | \
+        [ `$LFIND -r -v --obd $OBDUUID $DIR/d56 | sed '/^[ ]*1[ ]/d' |\
                 sed -n '/^[ ]*[0-9][0-9]*[ ]/p' | wc -l` -eq 0 ] || \
                 error "lfs find --obd wrong: should not show file on other obd"
         echo "lfs find --obd passed."
@@ -2395,6 +2394,19 @@ run_test 72 "Test that remove suid works properly (bug5695) ===="
 
 #b_cray run_test 73 "multiple MDC requests (should not deadlock)"
 
+test_74() { # bug 6149, 6184
+        #define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e
+        #
+        # very important to OR with OBD_FAIL_ONCE (0x80000000) -- otherwise it
+        # will spin in a tight reconnection loop
+        sysctl -w lustre.fail_loc=0x8000030e
+        # get any lock
+        touch $DIR/f74
+        sysctl -w lustre.fail_loc=0
+        true
+}
+run_test 74 "ldlm_enqueue freed-export error path (shouldn't LBUG)"
+
 # on the LLNL clusters, runas will still pick up root's $TMP settings,
 # which will not be writable for the runas user, and then you get a CVS
 # error message with a corrupt path string (CVS bug) and panic.
-- 
1.8.3.1
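
For reference, the new regression test can also be driven by hand outside the
test framework. The sketch below simply repeats the sequence that test_74
scripts above, assuming a mounted Lustre client; the MNT variable and the
f74-manual file name are illustrative only and not part of the patch. The key
point is ORing OBD_FAIL_ONCE (0x80000000) with OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT
(0x30e) so the forced enqueue failure fires exactly once instead of on every
reconnect attempt.

    #!/bin/sh
    # Manual version of sanity.sh test_74 (illustrative; assumes a mounted client).
    MNT=${MNT:-/mnt/lustre}               # hypothetical client mountpoint
    # Arm the fail point once: OBD_FAIL_ONCE | OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT
    sysctl -w lustre.fail_loc=0x8000030e
    touch $MNT/f74-manual                 # any request that enqueues a DLM lock
    sysctl -w lustre.fail_loc=0           # disarm again
    echo "enqueue error path exercised; server should not LBUG"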