X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Freplay-dual.sh;h=76eb25265009b0743ba578146117ade585b4eda5;hb=4ecae3cd5af60e389eba1e6eff2913b09f557203;hp=8848b78532d5a9155a8a1d52db7f53e6cf00a7e7;hpb=89f9a5bced24ecb7c84040a1ed88dcef4384f7c6;p=fs%2Flustre-release.git

diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh
index 8848b78..76eb252 100755
--- a/lustre/tests/replay-dual.sh
+++ b/lustre/tests/replay-dual.sh
@@ -9,6 +9,10 @@ init_test_env $@
 
 . ${CONFIG:=$LUSTRE/tests/cfg/lmv.sh}
 
+# Skip these tests
+# 21 - open vs. unlink out of order replay: isn't solved yet
+ALWAYS_EXCEPT="21"
+
 SETUP=${SETUP:-"setup"}
 CLEANUP=${CLEANUP:-"cleanup"}
 
@@ -27,7 +31,7 @@ gen_config() {
         add_mds mds1 --dev $MDSDEV --size $MDSSIZE
         add_lov lov1 mds1 --stripe_sz $STRIPE_BYTES \
             --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
-        MDS=mds1_svc
+        MDS=mds1
     fi
 
     add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE --failover
@@ -51,13 +55,22 @@ cleanup() {
     umount $MOUNT2 || true
     umount $MOUNT || true
     rmmod llite
+
+    # b=3941
+    # During MDS recovery the MDS clears orphans on the OST through
+    # mds_lov_clear_orphan, which sends a request to the OST and waits for
+    # the reply.  If the MDS is stopped at that point we hit obd_refcount > 1
+    # errors, because mds_lov_clear_orphan holds an export reference on the
+    # MDS, so the MDS obd_refcount does not drop to zero.  Wait a while
+    # before stopping the MDS.  This bug needs further work.
     for mds in `mds_list`; do
+        sleep 5
         stop $mds ${FORCE} $MDSLCONFARGS
     done
-    stop_lgssd
-    stop_lsvcgssd
     stop ost2 ${FORCE}
     stop ost ${FORCE} --dump cleanup-dual.log
+    stop_lgssd
+    stop_lsvcgssd
 }
 
 if [ "$ONLY" == "cleanup" ]; then
@@ -70,6 +83,8 @@ setup() {
     gen_config
 
     start_krb5_kdc || exit 1
+    start_lsvcgssd || exit 2
+    start_lgssd || exit 3
     start ost --reformat $OSTLCONFARGS
     PINGER=`cat /proc/fs/lustre/pinger`
@@ -80,8 +95,6 @@
     fi
 
     start ost2 --reformat $OSTLCONFARGS
-    start_lsvcgssd || exit 2
-    start_lgssd || exit 3
     [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
     for mds in `mds_list`; do
         start $mds --reformat $MDSLCONFARGS
@@ -345,6 +358,7 @@ test_14() {
     facet_failover mds1
     # expect failover to fail
     df $MOUNT && return 1
+    sleep 1
 
     # first 25 files should have been
     # replayed
@@ -364,6 +378,7 @@ test_15() {
     facet_failover mds1
     df $MOUNT || return 1
+    sleep 1
 
     unlinkmany $MOUNT1/$tfile- 25 || return 2
 
@@ -381,6 +396,7 @@ test_16() {
     sleep $TIMEOUT
     facet_failover mds1
     df $MOUNT || return 1
+    sleep 1
 
     unlinkmany $MOUNT1/$tfile- 25 || return 2
 
@@ -403,6 +419,7 @@ test_17() {
     sleep $TIMEOUT
     facet_failover ost
     df $MOUNT || return 1
+    sleep 1
 
     unlinkmany $MOUNT1/$tfile- 25 || return 2
 
@@ -431,7 +448,6 @@ test_18 () {
 }
 run_test 18 "replay open, Abort recovery, don't assert (3892)"
-
 # cleanup with blocked enqueue fails until timer elapses (MDS busy), wait for
 # it
 export NOW=0
 
@@ -439,7 +455,7 @@ test_20() { # bug 3822 - evicting client with enqueued lock
     mkdir -p $MOUNT1/$tdir
     touch $MOUNT1/$tdir/f0
 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
-    statmany -s $MOUNT1/$tdir/f 500 &
+    statmany -s $MOUNT1/$tdir/f 1 500 &
     OPENPID=$!
     NOW=`date +%s`
     do_facet mds1 sysctl -w lustre.fail_loc=0x8000030b # hold enqueue
@@ -452,9 +468,83 @@ test_20() { # bug 3822 - evicting client with enqueued lock
     wait $OPENPID
     dmesg | grep "entering recovery in server" && \
         error "client not evicted" || true
+    do_facet client sysctl -w lustre.fail_loc=0
 }
 run_test 20 "ldlm_handle_enqueue succeeds on evicted export (3822)"
 
+# $1 - fs num (1, 2, ...)
+# $2 - mds
+function find_dev_for_fs_and_mds()
+{
+    local fs=`ls /proc/fs/lustre/llite|head -n $1|tail -n1`
+    local fsuuid=`cat /proc/fs/lustre/llite/$fs/uuid`
+    $LCTL device_list | awk "/mdc.*$2.*$fsuuid/ {print \$4}"
+}
+
+test_21() {
+    mdc1dev=`find_dev_for_fs_and_mds 1 mds1`
+    mdc2dev=`find_dev_for_fs_and_mds 2 mds1`
+    multiop $MOUNT1/f21 O
+    cancel_lru_locks MDC
+    # generate IT_OPEN to be replayed against existing file
+    multiop $MOUNT1/f21 o_Sc &
+    pid=$!
+
+    # the IT_OPEN will be committed by the time of failover
+    replay_barrier mds1
+
+    # generate MDS_REINT_UNLINK to be replayed
+    rm -f $MOUNT2/f21 || return 1
+
+    # disable recovery on both clients
+    $LCTL --device %$mdc1dev disable_recovery
+    $LCTL --device %$mdc2dev disable_recovery
+    facet_failover mds1
+
+    # let the unlink be replayed first
+    $LCTL --device %$mdc2dev enable_recovery
+    sleep $((TIMEOUT/2))
+
+    # now let the open be replayed
+    $LCTL --device %$mdc1dev enable_recovery
+    kill -USR1 $pid
+    wait $pid || return 2
+}
+run_test 21 "open vs. unlink out of order replay"
+
+test_22() {    # bug 6063 - AST during recovery
+    cancel_lru_locks MDC
+    cat /proc/fs/lustre/ldlm/namespaces/mds-*/lock_count
+    mdc1dev=`find_dev_for_fs_and_mds 1 mds1`
+    mdc2dev=`find_dev_for_fs_and_mds 2 mds1`
+    $LCTL --device %$mdc1dev disable_recovery
+    $LCTL --device %$mdc2dev disable_recovery
+
+    replay_barrier mds1
+    mknod $MOUNT1/${tdir}-1 c 0 0    # client1: request to be replayed
+    ls $MOUNT2                       # client2: take the lock needed by that replay
+    facet_failover mds1
+
+    # let's recover the 2nd connection with the granted UPDATE lock
+    $LCTL --device %$mdc2dev enable_recovery
+    sleep $((TIMEOUT / 2))
+
+    LOCKS=`grep -v '^0$' /proc/fs/lustre/ldlm/namespaces/mds-*/lock_count`
+    if [ "$LOCKS" != "" ]; then
+        echo "The lock was replayed before the mknod was replayed: $LOCKS"
+        $LCTL --device %$mdc1dev enable_recovery
+        return 1
+    fi
+
+    # let's recover the 1st connection with the mknod replay that needs the lock
+    $LCTL --device %$mdc1dev enable_recovery
+    sleep $TIMEOUT
+
+    df $MOUNT || return 2
+    return 0
+}
+run_test 22 "AST during recovery"
+
 if [ "$ONLY" != "setup" ]; then
     equals_msg test complete, cleaning up
     if [ $NOW ]; then
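The ordered-replay idiom that the new test_21 and test_22 depend on can be summarized outside the patch as follows. This is a minimal sketch, not part of the change, and it assumes the same helpers the script already uses ($LCTL, facet_failover, find_dev_for_fs_and_mds, TIMEOUT) and the mds1 facet name.

# Sketch: make client 2's requests replay before client 1's after an MDS
# failover, using the same lctl recovery gating as test_21/test_22 above.
mdc1dev=`find_dev_for_fs_and_mds 1 mds1`    # MDC device for the first mount
mdc2dev=`find_dev_for_fs_and_mds 2 mds1`    # MDC device for the second mount

$LCTL --device %$mdc1dev disable_recovery   # hold back both clients so that
$LCTL --device %$mdc2dev disable_recovery   # neither starts replay on its own
facet_failover mds1                         # restart the MDS

$LCTL --device %$mdc2dev enable_recovery    # client 2 replays first
sleep $((TIMEOUT / 2))                      # give its replay time to complete
$LCTL --device %$mdc1dev enable_recovery    # then client 1 replays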