From 4766605607c044abb953da582e9336e24982dcae Mon Sep 17 00:00:00 2001 From: shadow Date: Wed, 26 Mar 2008 06:22:07 +0000 Subject: [PATCH] fix panic with double free request if network error. b=15027 i=johann i=fanyong --- lustre/ChangeLog | 9 +++++++++ lustre/include/obd_support.h | 1 + lustre/mdc/mdc_lib.c | 4 ++-- lustre/mdc/mdc_locks.c | 21 ++++++++++++++------- lustre/tests/sanity.sh | 25 ++++++++++++++++++++++--- 5 files changed, 48 insertions(+), 12 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 6f06c32..7e0f3b8 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -18,6 +18,15 @@ tbd Sun Microsystems, Inc. * RHEL 4 and RHEL 5/SLES 10 clients behaves differently on 'cd' to a removed cwd "./" (refer to Bugzilla 14399). +Severity : major +Bugzilla : 15027 +Frequency : on network error +Description: panic with double free request if network error +Details : mdc_finish_enqueue finishes the request if any network error + occurs, but this is true only for a synchronous enqueue; for an + async enqueue (via ptlrpcd) this is incorrect, because ptlrpcd + wants to finish the request itself. 
+ Severity : normal Bugzilla : 14533 Frequency : rare, on recovery diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index b42a71e..6dcb0d9 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -257,6 +257,7 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 #define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 #define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802 +#define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 #define OBD_FAIL_MGS 0x900 #define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 diff --git a/lustre/mdc/mdc_lib.c b/lustre/mdc/mdc_lib.c index c66de79..bb02005 100644 --- a/lustre/mdc/mdc_lib.c +++ b/lustre/mdc/mdc_lib.c @@ -443,7 +443,7 @@ void mdc_exit_request(struct client_obd *cli) spin_lock(&cli->cl_loi_list_lock); cli->cl_r_in_flight--; - + list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { /* No free request slots anymore */ @@ -456,6 +456,6 @@ void mdc_exit_request(struct client_obd *cli) wake_up(&mcw->mcw_waitq); } /* Empty waiting list? Decrease reqs in-flight number */ - + spin_unlock(&cli->cl_loi_list_lock); } diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index 44af012..85c9580 100644 --- a/lustre/mdc/mdc_locks.c +++ b/lustre/mdc/mdc_locks.c @@ -406,6 +406,7 @@ static int mdc_finish_enqueue(struct obd_export *exp, struct ldlm_reply *lockrep; ENTRY; + LASSERT(rc >= 0); /* Similarly, if we're going to replay this request, we don't want to * actually get a lock, just perform the intent. 
*/ if (req->rq_transno || req->rq_replay) { @@ -418,12 +419,6 @@ static int mdc_finish_enqueue(struct obd_export *exp, einfo->ei_mode = 0; memset(lockh, 0, sizeof(*lockh)); rc = 0; - } else if (rc != 0) { - CERROR("ldlm_cli_enqueue: %d\n", rc); - LASSERTF(rc < 0, "rc %d\n", rc); - mdc_clear_replay_flag(req, rc); - ptlrpc_req_finished(req); - RETURN(rc); } else { /* rc = 0 */ struct ldlm_lock *lock = ldlm_handle2lock(lockh); LASSERT(lock); @@ -587,7 +582,12 @@ int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, 0, NULL, lockh, 0); mdc_exit_request(&obddev->u.cli); mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it); - + if (rc < 0) { + CERROR("ldlm_cli_enqueue: %d\n", rc); + mdc_clear_replay_flag(req, rc); + ptlrpc_req_finished(req); + RETURN(rc); + } rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); RETURN(rc); @@ -832,9 +832,16 @@ static int mdc_intent_getattr_async_interpret(struct ptlrpc_request *req, obddev = class_exp2obd(exp); mdc_exit_request(&obddev->u.cli); + if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE)) + rc = -ETIMEDOUT; rc = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, 1, einfo->ei_mode, &flags, NULL, 0, NULL, lockh, rc); + if (rc < 0) { + CERROR("ldlm_cli_enqueue: %d\n", rc); + mdc_clear_replay_flag(req, rc); + GOTO(out, rc); + } rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); if (rc) diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 7517d15..838c19c 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -4635,8 +4635,7 @@ test_122() { #bug 11544 } run_test 122 "fail client bulk callback (shouldn't LBUG) =======" -test_123() # statahead(bug 11401) -{ +test_123a() { # was test 123, statahead(bug 11401) if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then log "testing on UP system. Performance may be not as good as expected." 
fi @@ -4657,6 +4656,7 @@ test_123() # statahead(bug 11401) etime=`date +%s` delta_sa=$((etime - stime)) log "ls $i files with statahead: $delta_sa sec" + lctl get_param -n llite.*.statahead_stats max=`lctl get_param -n llite.*.statahead_max | head -n 1` lctl set_param -n llite.*.statahead_max 0 @@ -4692,7 +4692,26 @@ test_123() # statahead(bug 11401) [ $error -ne 0 ] && error "statahead is slow!" return 0 } -run_test 123 "verify statahead work" +run_test 123a "verify statahead work" + +test_123b () { # statahead(bug 15027) + mkdir -p $DIR/$tdir + createmany -o $DIR/$tdir/$tfile-%d 1000 + + cancel_lru_locks mdc + cancel_lru_locks osc + +#define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 + sysctl -w lustre.fail_loc=0x80000803 + ls -lR $DIR/$tdir > /dev/null + log "ls done" + sysctl -w lustre.fail_loc=0x0 + lctl get_param -n llite.*.statahead_stats + rm -r $DIR/$tdir + sync + +} +run_test 123b "not panic with network error in statahead enqueue (bug 15027)" test_124a() { [ -z "`lctl get_param -n mdc.*.connect_flags | grep lru_resize`" ] && \ -- 1.8.3.1