From c7e34b11887dc6bdf9b294925462469519ddf1a1 Mon Sep 17 00:00:00 2001 From: adilger Date: Wed, 24 Mar 2004 02:50:22 +0000 Subject: [PATCH] Don't copy lvb into reply message on error, since that message might not have allocated enough space for the lvb and the client will ignore it anyways. Also fixes a NULL request deref in mds_mfd_close() when cleaning up opens on the MDS. The initial sanity test (66b) triggered a new bug 2986, but another test with multiop tests this properly. b=2983 --- lustre/ChangeLog | 1 + lustre/ldlm/ldlm_lock.c | 1 + lustre/ldlm/ldlm_lockd.c | 7 ++++- lustre/mds/mds_open.c | 20 +++++++------- lustre/ptlrpc/import.c | 69 ++++++++++++++++++++++-------------------------- lustre/tests/sanity.sh | 46 ++++++++++++++++++++++++++------ lustre/tests/sanityN.sh | 2 +- 7 files changed, 90 insertions(+), 56 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 753aa40..10b396e 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -2,6 +2,7 @@ tbd Cluster File Systems, Inc. * version 1.2.x * Bug fixes - clear page cache after eviction (2766) + - don't copy lvb into (possibly NULL) reply on error (2983) 2004-03-22 Cluster File Systems, Inc. * version 1.2.1 diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index 1b5b3fb..756dd1c 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -187,6 +187,7 @@ void ldlm_lock_destroy(struct ldlm_lock *lock) } if (!list_empty(&lock->l_res_link)) { + LDLM_ERROR(lock, "lock still on resource"); ldlm_lock_dump(D_ERROR, lock, 0); LBUG(); } diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 5fc764e..a018807 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -620,6 +620,9 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req, buffers = 2; } + if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR)) + GOTO(out, rc = -ENOMEM); + rc = lustre_pack_reply(req, buffers, size, NULL); if (rc) GOTO(out, rc); @@ -659,6 +662,7 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req, err = lustre_pack_reply(req, 0, NULL, NULL); if (rc == 0) rc = err; + req->rq_status = rc; } /* The LOCK_CHANGED code in ldlm_lock_enqueue depends on this @@ -667,9 +671,10 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req, LDLM_DEBUG(lock, "server-side enqueue handler, sending reply" "(err=%d, rc=%d)", err, rc); - if (lock->l_resource->lr_lvb_len > 0) { + if (lock->l_resource->lr_lvb_len > 0 && rc == 0) { void *lvb = lustre_msg_buf(req->rq_repmsg, 1, lock->l_resource->lr_lvb_len); + LASSERT(lvb != NULL); memcpy(lvb, lock->l_resource->lr_lvb_data, lock->l_resource->lr_lvb_len); } diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index f013e9d..0ed7c66 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -1038,7 +1038,6 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd, void *handle = NULL; struct mds_body *request_body = NULL, *reply_body = NULL; struct dentry_params dp; - struct lov_mds_md *lmm; ENTRY; if (req != NULL) { @@ -1062,6 +1061,8 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd, } if (last_orphan && unlink_orphan) { + int stripe_count = 0; + struct lov_mds_md *lmm = NULL; LASSERT(rc == 0); /* mds_put_write_access must have succeeded */ CDEBUG(D_HA, "destroying orphan object %s\n", fidname); @@ -1079,20 +1080,21 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd, LASSERT(pending_child->d_inode != NULL); cleanup_phase = 2; /* dput(pending_child) when finished */ - lmm = lustre_msg_buf(req->rq_repmsg, 1, 0); - handle = fsfilt_start_log(obd, pending_dir, - FSFILT_OP_UNLINK, NULL, - le32_to_cpu(lmm->lmm_stripe_count)); + if (req != NULL) { + lmm = lustre_msg_buf(req->rq_repmsg, 1, 0); + stripe_count = le32_to_cpu(lmm->lmm_stripe_count); + } + + handle = fsfilt_start_log(obd, pending_dir, FSFILT_OP_UNLINK, + NULL, stripe_count); if (IS_ERR(handle)) { rc = PTR_ERR(handle); handle = NULL; GOTO(cleanup, rc); } - if (req != NULL && - (reply_body->valid & OBD_MD_FLEASIZE) && - mds_log_op_unlink(obd, pending_child->d_inode, - lustre_msg_buf(req->rq_repmsg, 1, 0), + if (req != NULL && (reply_body->valid & OBD_MD_FLEASIZE) && + mds_log_op_unlink(obd, pending_child->d_inode, lmm, req->rq_repmsg->buflens[1], lustre_msg_buf(req->rq_repmsg, 2, 0), req->rq_repmsg->buflens[2]) > 0) { diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 374e46e..7ed3c17 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -78,7 +78,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp); int ptlrpc_init_import(struct obd_import *imp) { unsigned long flags; - + spin_lock_irqsave(&imp->imp_lock, flags); imp->imp_generation++; @@ -96,18 +96,18 @@ int ptlrpc_set_import_discon(struct obd_import *imp) { unsigned long flags; int rc = 0; - + spin_lock_irqsave(&imp->imp_lock, flags); if (imp->imp_state == LUSTRE_IMP_FULL) { IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); - spin_unlock_irqrestore(&imp->imp_lock, flags); + spin_unlock_irqrestore(&imp->imp_lock, flags); obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); rc = 1; } else { spin_unlock_irqrestore(&imp->imp_lock, flags); CDEBUG(D_HA, "%p %s: import already not connected: %s\n", - imp,imp->imp_client->cli_name, + imp,imp->imp_client->cli_name, ptlrpc_import_state_name(imp->imp_state)); } @@ -153,24 +153,23 @@ void ptlrpc_invalidate_import(struct obd_import *imp, int in_rpc) if (!imp->imp_invalid) ptlrpc_deactivate_import(imp); - + LASSERT(imp->imp_invalid); if (in_rpc) inflight = 1; - /* wait for all requests to error out and call completion - callbacks */ - lwi = LWI_TIMEOUT_INTR(MAX(obd_timeout * HZ, 1), NULL, + /* wait for all requests to error out and call completion callbacks */ + lwi = LWI_TIMEOUT_INTR(MAX(obd_timeout * HZ, 1), NULL, NULL, NULL); - rc = l_wait_event(imp->imp_recovery_waitq, - (atomic_read(&imp->imp_inflight) == inflight), + rc = l_wait_event(imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inflight) == inflight), &lwi); - + if (rc) CERROR("%s: rc = %d waiting for callback (%d != %d)\n", imp->imp_target_uuid.uuid, rc, atomic_read(&imp->imp_inflight), inflight); - + obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); } @@ -203,16 +202,15 @@ void ptlrpc_fail_import(struct obd_import *imp, int generation) imp->imp_obd->obd_name); ptlrpc_deactivate_import(imp); } - - CDEBUG(D_HA, "%s: waking up pinger\n", + + CDEBUG(D_HA, "%s: waking up pinger\n", imp->imp_target_uuid.uuid); - + spin_lock_irqsave(&imp->imp_lock, flags); imp->imp_force_verify = 1; spin_unlock_irqrestore(&imp->imp_lock, flags); - + ptlrpc_pinger_wake_up(); - } EXIT; } @@ -250,7 +248,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING); - imp->imp_conn_cnt++; + imp->imp_conn_cnt++; imp->imp_last_replay_transno = 0; if (imp->imp_remote_handle.cookie == 0) { @@ -288,7 +286,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) imp->imp_connection = conn; dlmexp = class_conn2export(&imp->imp_dlm_handle); - + LASSERT(dlmexp != NULL); if (dlmexp->exp_connection) @@ -339,7 +337,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, unsigned long flags; int msg_flags; ENTRY; - + spin_lock_irqsave(&imp->imp_lock, flags); if (imp->imp_state == LUSTRE_IMP_CLOSED) { spin_unlock_irqrestore(&imp->imp_lock, flags); @@ -388,7 +386,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, imp->imp_remote_handle = request->rq_repmsg->handle; } else { CERROR("reconnected to %s@%s after partition\n", - imp->imp_target_uuid.uuid, + imp->imp_target_uuid.uuid, imp->imp_connection->c_remote_uuid.uuid); } @@ -396,20 +394,17 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); else IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); - } - else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) { + } else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) { LASSERT(imp->imp_replayable); imp->imp_remote_handle = request->rq_repmsg->handle; IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); - } - else { + } else { imp->imp_remote_handle = request->rq_repmsg->handle; IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); } - + /* Sanity checks for a reconnected import. */ - if (!(imp->imp_replayable) != - !(msg_flags & MSG_CONNECT_REPLAYABLE)) { + if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) { CERROR("imp_replayable flag does not match server " "after reconnect. We should LBUG right here.\n"); } @@ -433,7 +428,7 @@ finish: imp->imp_connection->c_remote_uuid.uuid); ptlrpc_connect_import(imp, NULL); RETURN(0); - } + } } out: if (rc != 0) { @@ -473,7 +468,7 @@ static int signal_completed_replay(struct obd_import *imp) req->rq_replen = lustre_msg_size(0, NULL); req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT; req->rq_reqmsg->flags |= MSG_LAST_REPLAY; - req->rq_timeout *= 3; + req->rq_timeout *= 3; req->rq_interpret_reply = completed_replay_interpret; ptlrpcd_add_req(req); @@ -493,13 +488,13 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) ptlrpc_invalidate_import(imp, 1); IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); - } - + } + if (imp->imp_state == LUSTRE_IMP_REPLAY) { CDEBUG(D_HA, "replay requested by %s\n", imp->imp_target_uuid.uuid); rc = ptlrpc_replay_next(imp, &inflight); - if (inflight == 0 && + if (inflight == 0 && atomic_read(&imp->imp_replay_inflight) == 0) { IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); rc = ldlm_replay_locks(imp); @@ -535,7 +530,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) GOTO(out, rc); IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); ptlrpc_activate_import(imp); - } + } if (imp->imp_state == LUSTRE_IMP_FULL) { wake_up(&imp->imp_recovery_waitq); @@ -546,7 +541,7 @@ int ptlrpc_import_recovery_state_machine(struct obd_import *imp) RETURN(rc); } -static int back_to_sleep(void *unused) +static int back_to_sleep(void *unused) { return 0; } @@ -572,9 +567,9 @@ int ptlrpc_disconnect_import(struct obd_import *imp) if (ptlrpc_import_in_recovery(imp)) { struct l_wait_info lwi; - lwi = LWI_TIMEOUT_INTR(MAX(obd_timeout * HZ, 1), back_to_sleep, + lwi = LWI_TIMEOUT_INTR(MAX(obd_timeout * HZ, 1), back_to_sleep, NULL, NULL); - rc = l_wait_event(imp->imp_recovery_waitq, + rc = l_wait_event(imp->imp_recovery_waitq, !ptlrpc_import_in_recovery(imp), &lwi); } diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index b5d59bc..64cad6b 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -7,8 +7,8 @@ set -e ONLY=${ONLY:-"$*"} -# bug number for skipped test: 2399 (temporarily until new kernels arrive) -ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"48"} +# bug number for skipped test: 2986 +ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"66b"} # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! [ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT" @@ -1122,6 +1122,7 @@ test_34c() { run_test 34c "O_RDWR opening file-with-size works ==============" test_34d() { + [ ! -f $DIR/f34 ] && test_34a dd if=/dev/zero of=$DIR/f34 conv=notrunc bs=4k count=1 || error $CHECKSTAT -s $TEST_34_SIZE $DIR/f34 || error rm $DIR/f34 @@ -1240,14 +1241,19 @@ stop_kupdated() { } # ensure that all stripes have some grant before we test client-side cache -for i in `seq -f $DIR/f42-%g 1 $STRIPECOUNT`; do - dd if=/dev/zero of=$i bs=4k count=1 - rm $i -done +setup_test42() { + [ "$SETUP_TEST42" ] && return + for i in `seq -f $DIR/f42-%g 1 $STRIPECOUNT`; do + dd if=/dev/zero of=$i bs=4k count=1 + rm $i + done + SETUP_TEST42=DONE +} # Tests 42* verify that our behaviour is correct WRT caching, file closure, # file truncation, and file removal. test_42a() { + setup_test42 cancel_lru_locks OSC stop_kupdated sync; sleep 1; sync # just to be safe @@ -1262,6 +1268,7 @@ test_42a() { run_test 42a "ensure that we don't flush on close ==============" test_42b() { + setup_test42 cancel_lru_locks OSC stop_kupdated sync @@ -1474,7 +1481,7 @@ test_47() { } run_test 47 "Device nodes check ================================" -test_48a() { +test_48a() { # bug 2399 mkdir $DIR/d48a cd $DIR/d48a mv $DIR/d48a $DIR/d48.new || error "move directory failed" @@ -1490,7 +1497,7 @@ test_48a() { } run_test 48a "Access renamed working dir (should return errors)=" -test_48b() { +test_48b() { # bug 2399 mkdir $DIR/d48b cd $DIR/d48b rmdir $DIR/d48b || error "remove cwd $DIR/d48b failed" @@ -1828,6 +1835,29 @@ test_65() { } run_test 65 "Verify that the files are created using parent dir's stripe info" +test_66a() { # bug 2983 - ldlm_handle_enqueue cleanup + mkdir -p $DIR/d66 + multiop $DIR/d66/f66 O_wc & + MULTI_PID=$! + usleep 500 + cancel_lru_locks OSC +#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 + sysctl -w lustre.fail_loc=0x80000308 + kill -USR1 $MULTI_PID + wait $MULTI_PID && error "multiop didn't fail enqueue" || true +} +run_test 66a "ldlm_handle_enqueue error (should return error) ===" + +test_66b() { # bug 2986 - ldlm_handle_enqueue error during open + mkdir $DIR/d66 + touch $DIR/d66/f66 + cancel_lru_locks OSC +#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 + sysctl -w lustre.fail_loc=0x80000308 + dd if=/etc/hosts of=$DIR/d66/f66 && error "didn't fail enqueue" || true +} +run_test 66b "ldlm_handle_enqueue error (should return error) ===" + # on the LLNL clusters, runas will still pick up root's $TMP settings, # which will not be writable for the runas user, and then you get a CVS diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index 756ffdb..f475dda 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -317,7 +317,7 @@ test_14b() { run_test 14b "truncate of file being executed should return -ETXTBSY" test_15() { # bug 974 - ENOSPC - env + echo $PATH sh oos2.sh $MOUNT1 $MOUNT2 } run_test 15 "test out-of-space with multiple writers ===========" -- 1.8.3.1