From: liuy Date: Thu, 1 Nov 2007 05:16:15 +0000 (+0000) Subject: b=3462 X-Git-Tag: v1_8_0_110~1039 X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=09ac92105f4e65485248fa6cb06823b95e1fec57;p=fs%2Flustre-release.git b=3462 i=johann, deen replay-single tests: |X| open and close req or reply while two MDC requests in flight --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index daf027a..50738b1 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -142,6 +142,7 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_MDS_LOV_SYNC_RACE 0x138 #define OBD_FAIL_MDS_OSC_PRECREATE 0x139 #define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a +#define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b #define OBD_FAIL_OST 0x200 #define OBD_FAIL_OST_CONNECT_NET 0x201 diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 8cdf887..b325a50 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -1674,6 +1674,7 @@ int mds_handle(struct ptlrpc_request *req) DEBUG_REQ(D_INODE, req, "close"); OBD_FAIL_RETURN(OBD_FAIL_MDS_CLOSE_NET, 0); rc = mds_close(req, REQ_REC_OFF); + fail = OBD_FAIL_MDS_CLOSE_NET_REP; break; case MDS_DONE_WRITING: diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 79e8e4d..2e378d3 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -1051,8 +1051,177 @@ test_52() { } run_test 52 "time out lock replay (3764)" -#b_cray 53 "|X| open request and close reply while two MDC requests in flight" -#b_cray 54 "|X| open request and close reply while two MDC requests in flight" +# bug 3462 - simultaneous MDC requests +test_53a() { + mkdir -p $DIR/${tdir}-1 + mkdir -p $DIR/${tdir}-2 + multiop $DIR/${tdir}-1/f O_c & + close_pid=$! 
+ # give multiop a chance to open + sleep 1 + + #define OBD_FAIL_MDS_CLOSE_NET 0x115 + do_facet mds "sysctl -w lustre.fail_loc=0x80000115" + kill -USR1 $close_pid + cancel_lru_locks MDC # force the close + do_facet mds "sysctl -w lustre.fail_loc=0" + mcreate $DIR/${tdir}-2/f || return 1 + + # close should still be here + [ -d /proc/$close_pid ] || return 2 + replay_barrier_nodf mds + fail mds + wait $close_pid || return 3 + + $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4 + $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5 + rm -rf $DIR/${tdir}-* +} +run_test 53a "|X| close request while two MDC requests in flight" + +test_53b() { + mkdir -p $DIR/$tdir-1 + mkdir -p $DIR/$tdir-2 + multiop $DIR/$tdir-1/f O_c & + close_pid=$! + + #define OBD_FAIL_MDS_REINT_NET 0x107 + do_facet mds "sysctl -w lustre.fail_loc=0x80000107" + mcreate $DIR/${tdir}-2/f & + open_pid=$! + sleep 1 + + do_facet mds "sysctl -w lustre.fail_loc=0" + kill -USR1 $close_pid + cancel_lru_locks MDC # force the close + wait $close_pid || return 1 + # open should still be here + [ -d /proc/$open_pid ] || return 2 + + replay_barrier_nodf mds + fail mds + wait $open_pid || return 3 + + $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4 + $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5 + rm -rf $DIR/${tdir}-* +} +run_test 53b "|X| open request while two MDC requests in flight" + +test_53c() { + mkdir -p $DIR/${tdir}-1 + mkdir -p $DIR/${tdir}-2 + multiop $DIR/${tdir}-1/f O_c & + close_pid=$! + + do_facet mds "sysctl -w lustre.fail_loc=0x80000107" + mcreate $DIR/${tdir}-2/f & + open_pid=$! 
+ sleep 1 + + do_facet mds "sysctl -w lustre.fail_loc=0x80000115" + kill -USR1 $close_pid + cancel_lru_locks MDC # force the close + + replay_barrier_nodf mds + fail_nodf mds + wait $open_pid || return 1 + sleep 2 + # close should be gone + [ -d /proc/$close_pid ] && return 2 + do_facet mds "sysctl -w lustre.fail_loc=0" + + $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3 + $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4 + rm -rf $DIR/${tdir}-* +} +run_test 53c "|X| open request and close request while two MDC requests in flight" + +test_53d() { + mkdir -p $DIR/${tdir}-1 + mkdir -p $DIR/${tdir}-2 + multiop $DIR/${tdir}-1/f O_c & + close_pid=$! + # give multiop a chance to open + sleep 1 + + #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b + do_facet mds "sysctl -w lustre.fail_loc=0x8000013b" + kill -USR1 $close_pid + cancel_lru_locks MDC # force the close + do_facet mds "sysctl -w lustre.fail_loc=0" + mcreate $DIR/${tdir}-2/f || return 1 + + # close should still be here + [ -d /proc/$close_pid ] || return 2 + replay_barrier_nodf mds + fail mds + wait $close_pid || return 3 + + $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4 + $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5 + rm -rf $DIR/${tdir}-* +} +run_test 53d "|X| close reply while two MDC requests in flight" + +test_53e() { + mkdir -p $DIR/$tdir-1 + mkdir -p $DIR/$tdir-2 + multiop $DIR/$tdir-1/f O_c & + close_pid=$! + + #define OBD_FAIL_MDS_REINT_NET_REP 0x119 + do_facet mds "sysctl -w lustre.fail_loc=0x80000119" + mcreate $DIR/${tdir}-2/f & + open_pid=$! 
+ sleep 1 + + do_facet mds "sysctl -w lustre.fail_loc=0" + kill -USR1 $close_pid + cancel_lru_locks MDC # force the close + wait $close_pid || return 1 + # open should still be here + [ -d /proc/$open_pid ] || return 2 + + replay_barrier_nodf mds + fail mds + wait $open_pid || return 3 + + $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4 + $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5 + rm -rf $DIR/${tdir}-* +} +run_test 53e "|X| open reply while two MDC requests in flight" + +test_53h() { + mkdir -p $DIR/${tdir}-1 + mkdir -p $DIR/${tdir}-2 + multiop $DIR/${tdir}-1/f O_c & + close_pid=$! + + do_facet mds "sysctl -w lustre.fail_loc=0x80000107" + mcreate $DIR/${tdir}-2/f & + open_pid=$! + sleep 1 + + do_facet mds "sysctl -w lustre.fail_loc=0x8000013b" + kill -USR1 $close_pid + cancel_lru_locks MDC # force the close + sleep 1 + + replay_barrier_nodf mds + fail_nodf mds + wait $open_pid || return 1 + sleep 2 + # close should be gone + [ -d /proc/$close_pid ] && return 2 + do_facet mds "sysctl -w lustre.fail_loc=0" + + $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3 + $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4 + rm -rf $DIR/${tdir}-* +} +run_test 53h "|X| open request and close reply while two MDC requests in flight" #b3761 ASSERTION(hash != 0) failed test_55() { @@ -1357,7 +1526,6 @@ if [ -n "$ATOLDBASE" ]; then fi # end of AT tests includes above lines - equals_msg `basename $0`: test complete, cleaning up check_and_cleanup_lustre [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 00dbe74..b8e5c1e 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -486,6 +486,11 @@ fail() { df $MOUNT || error "post-failover df: $?" } +fail_nodf() { + local facet=$1 + facet_failover $facet +} + fail_abort() { local facet=$1 stop $facet