X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Ftests%2Freplay-single.sh;h=b8b77314d56c88a7b91c6f409d3d4b9fa94e63a9;hp=fd6901fa83b03804aa2c266dd9e4bf6afbb4a69b;hb=f44fe5abbc74ca79790c100a30193ded1ef1e6c9;hpb=0444a40d9838b868092c78d3bdd4c7c3a00199e6 diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index fd6901f..b8b7731 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -21,15 +21,17 @@ GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""} require_dsh_mds || exit 0 # Skip these tests -# bug number: 17466 18857 LU-1473 -ALWAYS_EXCEPT="61d 33a 33b 62 $REPLAY_SINGLE_EXCEPT" +# bug number for skipped tests: +# LU-472 +ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT 61d" +# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! -# 63 min 7 min AT AT AT AT" -[ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 6 12 16 44a 44b 65 66 67 68" +# 7.5 (min)" +[ "$SLOW" = "no" ] && EXCEPT_SLOW="44b" [ $(facet_fstype $SINGLEMDS) = "zfs" ] && # bug number for skipped test: LU-1867 LU-3127 - ALWAYS_EXCEPT="$ALWAYS_EXCEPT 89 73b" + ALWAYS_EXCEPT="$ALWAYS_EXCEPT 89 73b" build_test_filter @@ -1177,6 +1179,10 @@ run_test 52 "time out lock replay (3764)" # bug 3462 - simultaneous MDC requests test_53a() { + [[ $(lctl get_param mdc.*.import | + grep "connect_flags:.*multi_mod_rpc") ]] || + { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; } + cancel_lru_locks mdc # cleanup locks from former test cases mkdir $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed" mkdir $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed" @@ -1281,6 +1287,10 @@ test_53c() { run_test 53c "|X| open request and close request while two MDC requests in flight" test_53d() { + [[ $(lctl get_param mdc.*.import | + grep "connect_flags:.*multi_mod_rpc") ]] || + { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; } + cancel_lru_locks mdc # cleanup locks from former test cases mkdir $DIR/${tdir}-1 || error "mkdir 
$DIR/${tdir}-1 failed" @@ -1527,8 +1537,11 @@ test_58b() { large_xattr_enabled && orig="$(generate_string $(max_xattr_size))" || orig="bar" + # Original extended attribute can be long. Print a small version of + # attribute if an error occurs + local sm_msg=$(printf "%.9s" $orig) - mount_client $MOUNT2 + mount_client $MOUNT2 || error "mount_client on $MOUNT2 failed" mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed" touch $DIR/$tdir/$tfile || error "touch $DIR/$tdir/$tfile failed" replay_barrier $SINGLEMDS @@ -1536,43 +1549,52 @@ test_58b() { fail $SINGLEMDS new=$(get_xattr_value trusted.foo $MOUNT2/$tdir/$tfile) [[ "$new" = "$orig" ]] || - error "xattr set ($orig) is not what was returned ($new)" + error "xattr set ($sm_msg...) differs from xattr get ($new)" rm -f $DIR/$tdir/$tfile rmdir $DIR/$tdir cleanup_58 + wait_clients_import_state ${CLIENTS:-$HOSTNAME} "mgs" FULL } run_test 58b "test replay of setxattr op" test_58c() { # bug 16570 - local orig - local orig1 - local new - - trap cleanup_58 EXIT - - if large_xattr_enabled; then - local xattr_size=$(max_xattr_size) - orig="$(generate_string $((xattr_size / 2)))" - orig1="$(generate_string $xattr_size)" - else - orig="bar" - orig1="bar1" - fi + local orig + local orig1 + local new + + trap cleanup_58 EXIT + + if large_xattr_enabled; then + local xattr_size=$(max_xattr_size) + orig="$(generate_string $((xattr_size / 2)))" + orig1="$(generate_string $xattr_size)" + else + orig="bar" + orig1="bar1" + fi + + # PING_INTERVAL max(obd_timeout / 4, 1U) + sleep $((TIMEOUT / 4)) + + # Original extended attribute can be long. 
Print a small version of + # attribute if an error occurs + local sm_msg=$(printf "%.9s" $orig) + local sm_msg1=$(printf "%.9s" $orig1) - mount_client $MOUNT2 + mount_client $MOUNT2 || error "mount_client on $MOUNT2 failed" mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed" touch $DIR/$tdir/$tfile || error "touch $DIR/$tdir/$tfile failed" drop_request "setfattr -n trusted.foo -v $orig $DIR/$tdir/$tfile" || error "drop_request for setfattr failed" new=$(get_xattr_value trusted.foo $MOUNT2/$tdir/$tfile) [[ "$new" = "$orig" ]] || - error "xattr set ($orig) is not what was returned ($new)" + error "xattr set ($sm_msg...) differs from xattr get ($new)" drop_reint_reply "setfattr -n trusted.foo1 \ -v $orig1 $DIR/$tdir/$tfile" || - error "drop_request for setfattr failed" + error "drop_reint_reply for setfattr failed" new=$(get_xattr_value trusted.foo1 $MOUNT2/$tdir/$tfile) [[ "$new" = "$orig1" ]] || - error "second xattr set ($orig1) not what was returned ($new)" + error "second xattr set ($sm_msg1...) differs xattr get ($new)" rm -f $DIR/$tdir/$tfile rmdir $DIR/$tdir cleanup_58 @@ -2006,15 +2028,6 @@ check_for_process () { killall_process $clients "$prog" -0 } -killall_process () { - local clients=${1:-$(hostname)} - local name=$2 - local signal=$3 - local rc=0 - - do_nodes $clients "killall $signal $name" -} - test_70b () { local clients=${CLIENTS:-$HOSTNAME} @@ -2029,6 +2042,10 @@ test_70b () { local start_ts=$(date +%s) local cmd="rundbench 1 -t $duration" local pid="" + if [ $MDSCOUNT -ge 2 ]; then + test_mkdir -p -c$MDSCOUNT $DIR/$tdir + $LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir + fi do_nodesv $clients "set -x; MISSING_DBENCH_OK=$MISSING_DBENCH_OK \ PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests/:$DBENCH_LIB \ DBENCH_LIB=$DBENCH_LIB TESTSUITE=$TESTSUITE TESTNAME=$TESTNAME \ @@ -2049,6 +2066,7 @@ test_70b () { elapsed=$(($(date +%s) - start_ts)) local num_failovers=0 + local fail_index=1 while [ $elapsed -lt $duration ]; do if ! 
check_for_process $clients dbench; then error_noexit "dbench stopped on some of $clients!" @@ -2056,20 +2074,418 @@ test_70b () { break fi sleep 1 - replay_barrier $SINGLEMDS + replay_barrier mds$fail_index sleep 1 # give clients a time to do operations # Increment the number of failovers num_failovers=$((num_failovers+1)) - log "$TESTNAME fail $SINGLEMDS $num_failovers times" - fail $SINGLEMDS + log "$TESTNAME fail mds$fail_index $num_failovers times" + fail mds$fail_index elapsed=$(($(date +%s) - start_ts)) + if [ $fail_index -ge $MDSCOUNT ]; then + fail_index=1 + else + fail_index=$((fail_index+1)) + fi done wait $pid || error "rundbench load on $clients failed!" } -run_test 70b "mds recovery; $CLIENTCOUNT clients" +run_test 70b "dbench ${MDSCOUNT}mdts recovery; $CLIENTCOUNT clients" # end multi-client tests +random_fail_mdt() { + local max_index=$1 + local duration=$2 + local monitor_pid=$3 + local elapsed + local start_ts=$(date +%s) + local num_failovers=0 + local fail_index + + elapsed=$(($(date +%s) - start_ts)) + while [ $elapsed -lt $duration ]; do + fail_index=$((RANDOM%max_index+1)) + kill -0 $monitor_pid || + error "$monitor_pid stopped" + sleep 120 + replay_barrier mds$fail_index + sleep 10 + # Increment the number of failovers + num_failovers=$((num_failovers+1)) + log "$TESTNAME fail mds$fail_index $num_failovers times" + fail mds$fail_index + elapsed=$(($(date +%s) - start_ts)) + done +} + +cleanup_70c() { + trap 0 + rm -f $DIR/replay-single.70c.lck + rm -rf /$DIR/$tdir +} + +test_70c () { + local clients=${CLIENTS:-$HOSTNAME} + local rc=0 + + zconf_mount_clients $clients $MOUNT + + local duration=300 + [ "$SLOW" = "no" ] && duration=180 + # set duration to 900 because it takes some time to boot node + [ "$FAILURE_MODE" = HARD ] && duration=600 + + local elapsed + local start_ts=$(date +%s) + + trap cleanup_70c EXIT + ( + while [ ! 
-e $DIR/replay-single.70c.lck ]; do + test_mkdir -p -c$MDSCOUNT $DIR/$tdir || break + if [ $MDSCOUNT -ge 2 ]; then + $LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir || + error "set default dirstripe failed" + fi + cd $DIR/$tdir || break + tar cf - /etc | tar xf - || error "tar failed in loop" + done + )& + tar_70c_pid=$! + echo "Started tar $tar_70c_pid" + + random_fail_mdt $MDSCOUNT $duration $tar_70c_pid + kill -0 $tar_70c_pid || error "tar $tar_70c_pid stopped" + + touch $DIR/replay-single.70c.lck + wait $tar_70c_pid || error "$?: tar failed" + + cleanup_70c + true +} +run_test 70c "tar ${MDSCOUNT}mdts recovery" + +cleanup_70d() { + trap 0 + kill -9 $mkdir_70d_pid +} + +test_70d () { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + local clients=${CLIENTS:-$HOSTNAME} + local rc=0 + + zconf_mount_clients $clients $MOUNT + + local duration=300 + [ "$SLOW" = "no" ] && duration=180 + # set duration to 900 because it takes some time to boot node + [ "$FAILURE_MODE" = HARD ] && duration=900 + + mkdir -p $DIR/$tdir + + local elapsed + local start_ts=$(date +%s) + + trap cleanup_70d EXIT + ( + while true; do + $LFS mkdir -i0 -c2 $DIR/$tdir/test || { + echo "mkdir fails" + break + } + $LFS mkdir -i1 -c2 $DIR/$tdir/test1 || { + echo "mkdir fails" + break + } + + touch $DIR/$tdir/test/a || { + echo "touch fails" + break; + } + mkdir $DIR/$tdir/test/b || { + echo "mkdir fails" + break; + } + rm -rf $DIR/$tdir/test || { + echo "rmdir fails" + break + } + + touch $DIR/$tdir/test1/a || { + echo "touch fails" + break; + } + mkdir $DIR/$tdir/test1/b || { + echo "mkdir fails" + break; + } + + rm -rf $DIR/$tdir/test1 || { + echo "rmdir fails" + break + } + done + )& + mkdir_70d_pid=$! 
+ echo "Started $mkdir_70d_pid" + + random_fail_mdt $MDSCOUNT $duration $mkdir_70d_pid + kill -0 $mkdir_70d_pid || error "mkdir/rmdir $mkdir_70d_pid stopped" + + cleanup_70d + true +} +run_test 70d "mkdir/rmdir striped dir ${MDSCOUNT}mdts recovery" + +cleanup_70e() { + trap 0 + kill -9 $rename_70e_pid +} + +test_70e () { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + local clients=${CLIENTS:-$HOSTNAME} + local rc=0 + + echo ha > /proc/sys/lnet/debug + zconf_mount_clients $clients $MOUNT + + local duration=300 + [ "$SLOW" = "no" ] && duration=180 + # set duration to 900 because it takes some time to boot node + [ "$FAILURE_MODE" = HARD ] && duration=900 + + mkdir -p $DIR/$tdir + $LFS mkdir -i0 $DIR/$tdir/test_0 + $LFS mkdir -i0 $DIR/$tdir/test_1 + touch $DIR/$tdir/test_0/a + touch $DIR/$tdir/test_1/b + trap cleanup_70e EXIT + ( + while true; do + mrename $DIR/$tdir/test_0/a $DIR/$tdir/test_1/b > \ + /dev/null || { + echo "a->b fails" + break; + } + + checkstat $DIR/$tdir/test_0/a && { + echo "a still exists" + break + } + + checkstat $DIR/$tdir/test_1/b || { + echo "b still exists" + break + } + + touch $DIR/$tdir/test_0/a || { + echo "touch a fails" + break + } + + mrename $DIR/$tdir/test_1/b $DIR/$tdir/test_0/a > \ + /dev/null || { + echo "a->a fails" + break; + } + done + )& + rename_70e_pid=$! + echo "Started $rename_70e_pid" + + random_fail_mdt 2 $duration $rename_70e_pid + kill -0 $rename_70e_pid || error "rename $rename_70e_pid stopped" + + cleanup_70e + true +} +run_test 70e "rename cross-MDT with random fails" + +test_70f_write_and_read(){ + local srcfile=$1 + local stopflag=$2 + local client + + echo "Write/read files in: '$DIR/$tdir', clients: '$CLIENTS' ..." 
+ for client in ${CLIENTS//,/ }; do + [ -f $stopflag ] || return + + local tgtfile=$DIR/$tdir/$tfile.$client + do_node $client dd $DD_OPTS bs=1M count=10 if=$srcfile \ + of=$tgtfile 2>/dev/null || + error "dd $DD_OPTS bs=1M count=10 if=$srcfile " \ + "of=$tgtfile failed on $client, rc=$?" + done + + local prev_client=$(echo ${CLIENTS//,/ } | awk '{ print $NF }') + local index=0 + + for client in ${CLIENTS//,/ }; do + [ -f $stopflag ] || return + + # flush client cache in case test is running on only one client + # do_node $client cancel_lru_locks osc + do_node $client $LCTL set_param ldlm.namespaces.*.lru_size=clear + + tgtfile=$DIR/$tdir/$tfile.$client + local md5=$(do_node $prev_client "md5sum $tgtfile") + [ ${checksum[$index]// */} = ${md5// */} ] || + error "$tgtfile: checksum doesn't match on $prev_client" + index=$((index + 1)) + prev_client=$client + done +} + +test_70f_loop(){ + local srcfile=$1 + local stopflag=$2 + DD_OPTS= + + mkdir -p $DIR/$tdir || error "cannot create $DIR/$tdir directory" + $SETSTRIPE -c -1 $DIR/$tdir || error "cannot $SETSTRIPE $DIR/$tdir" + + touch $stopflag + while [ -f $stopflag ]; do + test_70f_write_and_read $srcfile $stopflag + # use direct IO and buffer cache in turns if loop + [ -n "$DD_OPTS" ] && DD_OPTS="" || DD_OPTS="oflag=direct" + done +} + +test_70f_cleanup() { + trap 0 + rm -f $TMP/$tfile.stop + do_nodes $CLIENTS rm -f $TMP/$tfile + rm -f $DIR/$tdir/$tfile.* +} + +test_70f() { +# [ x$ost1failover_HOST = x$ost_HOST ] && +# { skip "Failover host not defined" && return; } +# [ -z "$CLIENTS" ] && +# { skip "CLIENTS are not specified." && return; } +# [ $CLIENTCOUNT -lt 2 ] && +# { skip "Need 2 or more clients, have $CLIENTCOUNT" && return; } + + echo "mount clients $CLIENTS ..." 
+ zconf_mount_clients $CLIENTS $MOUNT + + local srcfile=$TMP/$tfile + local client + local index=0 + + trap test_70f_cleanup EXIT + # create a different source file local to each client node so we can + # detect if the file wasn't written out properly after failover + do_nodes $CLIENTS dd bs=1M count=10 if=/dev/urandom of=$srcfile \ + 2>/dev/null || error "can't create $srcfile on $CLIENTS" + for client in ${CLIENTS//,/ }; do + checksum[$index]=$(do_node $client "md5sum $srcfile") + index=$((index + 1)) + done + + local duration=120 + [ "$SLOW" = "no" ] && duration=60 + # set duration to 900 because it takes some time to boot node + [ "$FAILURE_MODE" = HARD ] && duration=900 + + local stopflag=$TMP/$tfile.stop + test_70f_loop $srcfile $stopflag & + local pid=$! + + local elapsed=0 + local num_failovers=0 + local start_ts=$SECONDS + while [ $elapsed -lt $duration ]; do + sleep 3 + replay_barrier ost1 + sleep 1 + num_failovers=$((num_failovers + 1)) + log "$TESTNAME failing OST $num_failovers times" + fail ost1 + sleep 2 + elapsed=$((SECONDS - start_ts)) + done + + rm -f $stopflag + wait $pid + test_70f_cleanup +} +run_test 70f "OSS O_DIRECT recovery with $CLIENTCOUNT clients" + +cleanup_71a() { + trap 0 + kill -9 $mkdir_71a_pid +} + +random_double_fail_mdt() { + local max_index=$1 + local duration=$2 + local monitor_pid=$3 + local elapsed + local start_ts=$(date +%s) + local num_failovers=0 + local fail_index + local second_index + + elapsed=$(($(date +%s) - start_ts)) + while [ $elapsed -lt $duration ]; do + fail_index=$((RANDOM%max_index + 1)) + if [ $fail_index -eq $max_index ]; then + second_index=1 + else + second_index=$((fail_index + 1)) + fi + kill -0 $monitor_pid || + error "$monitor_pid stopped" + sleep 120 + replay_barrier mds$fail_index + replay_barrier mds$second_index + sleep 10 + # Increment the number of failovers + num_failovers=$((num_failovers+1)) + log "fail mds$fail_index mds$second_index $num_failovers times" + fail 
mds${fail_index},mds${second_index} + elapsed=$(($(date +%s) - start_ts)) + done +} + +test_71a () { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + local clients=${CLIENTS:-$HOSTNAME} + local rc=0 + + zconf_mount_clients $clients $MOUNT + + local duration=300 + [ "$SLOW" = "no" ] && duration=180 + # set duration to 900 because it takes some time to boot node + [ "$FAILURE_MODE" = HARD ] && duration=900 + + mkdir -p $DIR/$tdir + + local elapsed + local start_ts=$(date +%s) + + trap cleanup_71a EXIT + ( + while true; do + $LFS mkdir -i0 -c2 $DIR/$tdir/test + rmdir $DIR/$tdir/test + done + )& + mkdir_71a_pid=$! + echo "Started $mkdir_71a_pid" + + random_double_fail_mdt 2 $duration $mkdir_71a_pid + kill -0 $mkdir_71a_pid || error "mkdir/rmdir $mkdir_71a_pid stopped" + + cleanup_71a + true +} +run_test 71a "mkdir/rmdir striped dir with 2 mdts recovery" + test_73a() { multiop_bg_pause $DIR/$tfile O_tSc || error "multiop_bg_pause $DIR/$tfile failed" @@ -2122,11 +2538,16 @@ run_test 74 "Ensure applications don't fail waiting for OST recovery" remote_dir_check_80() { local MDTIDX=1 - local diridx=$($GETSTRIPE -M $remote_dir) + local diridx + local fileidx + + diridx=$($GETSTRIPE -M $remote_dir) || + error "$GETSTRIPE -M $remote_dir failed" [ $diridx -eq $MDTIDX ] || error "$diridx != $MDTIDX" createmany -o $remote_dir/f-%d 20 || error "creation failed" - local fileidx=$($GETSTRIPE -M $remote_dir/f-1) + fileidx=$($GETSTRIPE -M $remote_dir/f-1) || + error "$GETSTRIPE -M $remote_dir/f-1 failed" [ $fileidx -eq $MDTIDX ] || error "$fileidx != $MDTIDX" return 0 @@ -2148,6 +2569,7 @@ test_80a() { $LFS mkdir -i $MDTIDX $remote_dir & local CLIENT_PID=$! + replay_barrier mds1 fail mds${MDTIDX} wait $CLIENT_PID || error "remote creation failed" @@ -2175,6 +2597,8 @@ test_80b() { $LFS mkdir -i $MDTIDX $remote_dir & local CLIENT_PID=$! 
+ replay_barrier mds1 + replay_barrier mds2 fail mds$((MDTIDX + 1)) wait $CLIENT_PID || error "remote creation failed" @@ -2202,6 +2626,8 @@ test_80c() { $LFS mkdir -i $MDTIDX $remote_dir & local CLIENT_PID=$! + replay_barrier mds1 + replay_barrier mds2 fail mds${MDTIDX} fail mds$((MDTIDX + 1)) @@ -2228,6 +2654,9 @@ test_80d() { # sleep 3 seconds to make sure MDTs are failed after # lfs mkdir -i has finished on all of MDTs. sleep 3 + + replay_barrier mds1 + replay_barrier mds2 fail mds${MDTIDX},mds$((MDTIDX + 1)) wait $CLIENT_PID || error "remote creation failed" @@ -2259,6 +2688,7 @@ test_80e() { # lfs mkdir -i has finished on all of MDTs. sleep 3 + replay_barrier mds1 fail mds${MDTIDX} wait $CLIENT_PID || error "remote creation failed" @@ -2285,6 +2715,7 @@ test_80f() { $LFS mkdir -i $MDTIDX $remote_dir & local CLIENT_PID=$! + replay_barrier mds2 fail mds$((MDTIDX + 1)) wait $CLIENT_PID || error "remote creation failed" @@ -2316,6 +2747,8 @@ test_80g() { # lfs mkdir -i has finished on all of MDTs. sleep 3 + replay_barrier mds1 + replay_barrier mds2 fail mds${MDTIDX} fail mds$((MDTIDX + 1)) @@ -2343,6 +2776,8 @@ test_80h() { # lfs mkdir -i has finished on all of MDTs. sleep 3 + replay_barrier mds1 + replay_barrier mds2 fail mds${MDTIDX},mds$((MDTIDX + 1)) wait $CLIENT_PID || error "remote dir creation failed" @@ -2373,6 +2808,7 @@ test_81a() { rmdir $remote_dir & local CLIENT_PID=$! + replay_barrier mds2 fail mds$((MDTIDX + 1)) wait $CLIENT_PID || error "rm remote dir failed" @@ -2402,6 +2838,7 @@ test_81b() { rmdir $remote_dir & local CLIENT_PID=$! + replay_barrier mds1 fail mds${MDTIDX} wait $CLIENT_PID || error "rm remote dir failed" @@ -2432,6 +2869,8 @@ test_81c() { rmdir $remote_dir & local CLIENT_PID=$! + replay_barrier mds1 + replay_barrier mds2 fail mds${MDTIDX} fail mds$((MDTIDX + 1)) @@ -2458,6 +2897,8 @@ test_81d() { rmdir $remote_dir & local CLIENT_PID=$! 
+ replay_barrier mds1 + replay_barrier mds2 fail mds${MDTIDX},mds$((MDTIDX + 1)) wait $CLIENT_PID || error "rm remote dir failed" @@ -2489,6 +2930,7 @@ test_81e() { local CLIENT_PID=$! do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0 + replay_barrier mds1 fail mds${MDTIDX} wait $CLIENT_PID || error "rm remote dir failed" @@ -2519,6 +2961,7 @@ test_81f() { rmdir $remote_dir & local CLIENT_PID=$! + replay_barrier mds2 fail mds$((MDTIDX + 1)) wait $CLIENT_PID || error "rm remote dir failed" @@ -2549,6 +2992,8 @@ test_81g() { rmdir $remote_dir & local CLIENT_PID=$! + replay_barrier mds1 + replay_barrier mds2 fail mds${MDTIDX} fail mds$((MDTIDX + 1)) @@ -2575,6 +3020,8 @@ test_81h() { rmdir $remote_dir & local CLIENT_PID=$! + replay_barrier mds1 + replay_barrier mds2 fail mds${MDTIDX},mds$((MDTIDX + 1)) wait $CLIENT_PID || error "rm remote dir failed" @@ -2607,7 +3054,7 @@ test_85a() { #bug 16774 done lov_id=$(lctl dl | grep "clilov") - addr=$(echo $lov_id | awk '{print $4}' | awk -F '-' '{print $3}') + addr=$(echo $lov_id | awk '{print $4}' | awk -F '-' '{print $NF}') count=$(lctl get_param -n \ ldlm.namespaces.*MDT0000*$addr.lock_unused_count) echo "before recovery: unused locks count = $count" @@ -2627,7 +3074,7 @@ run_test 85a "check the cancellation of unused locks during recovery(IBITS)" test_85b() { #bug 16774 lctl set_param -n ldlm.cancel_unused_locks_before_replay "1" - do_facet mgs $LCTL pool_new $FSNAME.$TESTNAME || + create_pool $FSNAME.$TESTNAME || error "unable to create pool $TESTNAME" do_facet mgs $LCTL pool_add $FSNAME.$TESTNAME $FSNAME-OST0000 || error "unable to add pool $TESTNAME" @@ -2647,11 +3094,11 @@ test_85b() { #bug 16774 done lov_id=$(lctl dl | grep "clilov") - addr=$(echo $lov_id | awk '{print $4}' | awk -F '-' '{print $3}') - count=$(lctl get_param \ - -n ldlm.namespaces.*OST0000*$addr.lock_unused_count) + addr=$(echo $lov_id | awk '{print $4}' | awk -F '-' '{print $NF}') + count=$(lctl get_param -n \ + 
ldlm.namespaces.*OST0000*$addr.lock_unused_count) echo "before recovery: unused locks count = $count" - [ $count != 0 ] || error "unused locks ($count) should be zero" + [ $count -ne 0 ] || error "unused locks ($count) should be zero" fail ost1 @@ -2680,7 +3127,7 @@ test_86() { } run_test 86 "umount server after clear nid_stats should not hit LBUG" -test_87() { +test_87a() { do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.sync_journal 0" replay_barrier ost1 @@ -2696,7 +3143,7 @@ test_87() { error "New checksum $cksum2 does not match original $cksum" fi } -run_test 87 "write replay" +run_test 87a "write replay" test_87b() { do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.sync_journal 0" @@ -2847,6 +3294,8 @@ test_90() { # bug 19494 return 0 fi fi + # ensure all OSTs are active to allow allocations + wait_osts_up mkdir $dir || error "mkdir $dir failed" @@ -2918,7 +3367,7 @@ test_90() { # bug 19494 } run_test 90 "lfs find identifies the missing striped file segments" -test_93() { +test_93a() { local server_version=$(lustre_version_code $SINGLEMDS) [[ $server_version -ge $(version_code 2.6.90) ]] || [[ $server_version -ge $(version_code 2.5.4) && @@ -2940,7 +3389,28 @@ test_93() { do_facet ost1 "$LCTL set_param fail_loc=0x715" fail ost1 } -run_test 93 "replay + reconnect" +run_test 93a "replay + reconnect" + +test_93b() { + local server_version=$(lustre_version_code $SINGLEMDS) + [[ $server_version -ge $(version_code 2.7.90) ]] || + { skip "Need MDS version 2.7.90+"; return; } + + cancel_lru_locks mdc + + createmany -o $DIR/$tfile 20 || + error "createmany -o $DIR/$tfile failed" + + #define OBD_FAIL_TGT_REPLAY_RECONNECT 0x715 + # We need to emulate a state that MDT is waiting for other clients + # not completing the recovery. Final ping is queued, but reply will be + # sent on the recovery completion. 
It is done by sleep before + # processing final pings + do_facet mds1 "$LCTL set_param fail_val=80" + do_facet mds1 "$LCTL set_param fail_loc=0x715" + fail mds1 +} +run_test 93b "replay + reconnect on mds" striped_dir_check_100() { local striped_dir=$DIR/$tdir/striped_dir @@ -3035,6 +3505,1095 @@ test_101() { #LU-5648 } run_test 101 "Shouldn't reassign precreated objs to other files after recovery" +test_102a() { + local idx + local facet + local num + local i + local pids pid + + [[ $(lctl get_param mdc.*.import | + grep "connect_flags:.*multi_mod_rpc") ]] || + { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; } + + $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed" + idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir)) + facet="mds$((0x$idx + 1))" + + # get current value of max_mod_rcps_in_flight + num=$($LCTL get_param -n \ + mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight) + # set default value if client does not support multi mod RPCs + [ -z "$num" ] && num=1 + + echo "creating $num files ..." + umask 0022 + for i in $(seq $num); do + touch $DIR/$tdir/file-$i + done + + # drop request on MDT to force resend + #define OBD_FAIL_MDS_REINT_MULTI_NET 0x159 + do_facet $facet "$LCTL set_param fail_loc=0x159" + echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..." + for i in $(seq $num); do + chmod 0600 $DIR/$tdir/file-$i & + pids="$pids $!" 
+ done + sleep 1 + do_facet $facet "$LCTL set_param fail_loc=0" + for pid in $pids; do + wait $pid || error "chmod failed" + done + echo "done ($(date +%H:%M:%S))" + + # check chmod succeed + for i in $(seq $num); do + checkstat -vp 0600 $DIR/$tdir/file-$i + done + + rm -rf $DIR/$tdir +} +run_test 102a "check resend (request lost) with multiple modify RPCs in flight" + +test_102b() { + local idx + local facet + local num + local i + local pids pid + + [[ $(lctl get_param mdc.*.import | + grep "connect_flags:.*multi_mod_rpc") ]] || + { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; } + + $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed" + idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir)) + facet="mds$((0x$idx + 1))" + + # get current value of max_mod_rcps_in_flight + num=$($LCTL get_param -n \ + mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight) + # set default value if client does not support multi mod RPCs + [ -z "$num" ] && num=1 + + echo "creating $num files ..." + umask 0022 + for i in $(seq $num); do + touch $DIR/$tdir/file-$i + done + + # drop reply on MDT to force reconstruction + #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a + do_facet $facet "$LCTL set_param fail_loc=0x15a" + echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..." + for i in $(seq $num); do + chmod 0600 $DIR/$tdir/file-$i & + pids="$pids $!" 
+ done + sleep 1 + do_facet $facet "$LCTL set_param fail_loc=0" + for pid in $pids; do + wait $pid || error "chmod failed" + done + echo "done ($(date +%H:%M:%S))" + + # check chmod succeed + for i in $(seq $num); do + checkstat -vp 0600 $DIR/$tdir/file-$i + done + + rm -rf $DIR/$tdir +} +run_test 102b "check resend (reply lost) with multiple modify RPCs in flight" + +test_102c() { + local idx + local facet + local num + local i + local pids pid + + [[ $(lctl get_param mdc.*.import | + grep "connect_flags:.*multi_mod_rpc") ]] || + { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; } + + $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed" + idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir)) + facet="mds$((0x$idx + 1))" + + # get current value of max_mod_rcps_in_flight + num=$($LCTL get_param -n \ + mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight) + # set default value if client does not support multi mod RPCs + [ -z "$num" ] && num=1 + + echo "creating $num files ..." + umask 0022 + for i in $(seq $num); do + touch $DIR/$tdir/file-$i + done + + replay_barrier $facet + + # drop reply on MDT + #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a + do_facet $facet "$LCTL set_param fail_loc=0x15a" + echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..." + for i in $(seq $num); do + chmod 0600 $DIR/$tdir/file-$i & + pids="$pids $!" 
+ done + sleep 1 + do_facet $facet "$LCTL set_param fail_loc=0" + + # fail MDT + fail $facet + + for pid in $pids; do + wait $pid || error "chmod failed" + done + echo "done ($(date +%H:%M:%S))" + + # check chmod succeed + for i in $(seq $num); do + checkstat -vp 0600 $DIR/$tdir/file-$i + done + + rm -rf $DIR/$tdir +} +run_test 102c "check replay w/o reconstruction with multiple mod RPCs in flight" + +test_102d() { + local idx + local facet + local num + local i + local pids pid + + [[ $(lctl get_param mdc.*.import | + grep "connect_flags:.*multi_mod_rpc") ]] || + { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; } + + $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed" + idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir)) + facet="mds$((0x$idx + 1))" + + # get current value of max_mod_rcps_in_flight + num=$($LCTL get_param -n \ + mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight) + # set default value if client does not support multi mod RPCs + [ -z "$num" ] && num=1 + + echo "creating $num files ..." + umask 0022 + for i in $(seq $num); do + touch $DIR/$tdir/file-$i + done + + # drop reply on MDT + #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a + do_facet $facet "$LCTL set_param fail_loc=0x15a" + echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..." + for i in $(seq $num); do + chmod 0600 $DIR/$tdir/file-$i & + pids="$pids $!" 
+ done + sleep 1 + + # write MDT transactions to disk + do_facet $facet "sync; sync; sync" + + do_facet $facet "$LCTL set_param fail_loc=0" + + # fail MDT + fail $facet + + for pid in $pids; do + wait $pid || error "chmod failed" + done + echo "done ($(date +%H:%M:%S))" + + # check chmod succeed + for i in $(seq $num); do + checkstat -vp 0600 $DIR/$tdir/file-$i + done + + rm -rf $DIR/$tdir +} +run_test 102d "check replay & reconstruction with multiple mod RPCs in flight" + +test_103() { + remote_mds_nodsh && skip "remote MDS with nodsh" && return +#define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162 + do_facet mds1 $LCTL set_param fail_loc=0x80000162 + + mkdir -p $DIR/$tdir + createmany -o $DIR/$tdir/t- 30 || + error "create files on remote directory failed" + sync + rm -rf $DIR/$tdir/t-* + sync +#MDS should crash with tr->otr_next_id overflow + fail mds1 +} +run_test 103 "Check otr_next_id overflow" + + +check_striped_dir_110() +{ + $CHECKSTAT -t dir $DIR/$tdir/striped_dir || + error "create striped dir failed" + local stripe_count=$($LFS getdirstripe -c $DIR/$tdir/striped_dir) + [ $stripe_count -eq $MDSCOUNT ] || + error "$stripe_count != 2 after recovery" +} + +test_110a() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + ([ $FAILURE_MODE == "HARD" ] && + [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) && + skip "MDTs needs to be on diff hosts for HARD fail mode" && + return 0 + + mkdir -p $DIR/$tdir + replay_barrier mds1 + $LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir + fail mds1 + + check_striped_dir_110 || error "check striped_dir failed" + rm -rf $DIR/$tdir || error "rmdir failed" + + return 0 +} +run_test 110a "DNE: create striped dir, fail MDT1" + +test_110b() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + ([ $FAILURE_MODE == "HARD" ] && + [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) && + skip "MDTs needs to be on diff hosts for HARD fail mode" && + return 0 + + mkdir -p $DIR/$tdir + replay_barrier mds1 + $LFS mkdir -i1 
-c$MDSCOUNT $DIR/$tdir/striped_dir + umount $MOUNT + fail mds1 + zconf_mount $(hostname) $MOUNT + client_up || return 1 + + check_striped_dir_110 || error "check striped_dir failed" + + rm -rf $DIR/$tdir || error "rmdir failed" + + return 0 +} +run_test 110b "DNE: create striped dir, fail MDT1 and client"

# Create a striped directory (master on MDT index 1) after a replay barrier
# on mds2, fail mds2, then verify the directory with check_striped_dir_110.
test_110c() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	mkdir -p $DIR/$tdir
	replay_barrier mds2
	$LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
	fail mds2

	check_striped_dir_110 || error "check striped_dir failed"

	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 110c "DNE: create striped dir, fail MDT2"

# Same as test_110c, but the client is unmounted before mds2 is failed and
# remounted afterwards, so the check runs on a fresh mount.
test_110d() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	mkdir -p $DIR/$tdir
	replay_barrier mds2
	$LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
	umount $MOUNT
	fail mds2
	zconf_mount $(hostname) $MOUNT
	client_up || return 1

	check_striped_dir_110 || error "check striped_dir failed"

	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 110d "DNE: create striped dir, fail MDT2 and client"

# Barrier on mds2 before the striped mkdir, unmount the client, barrier on
# mds1, then fail both MDSes together and verify after remounting.
test_110e() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	mkdir -p $DIR/$tdir
	replay_barrier mds2
	$LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
	umount $MOUNT
	replay_barrier mds1
	fail mds1,mds2
	zconf_mount $(hostname) $MOUNT
	client_up || return 1

	check_striped_dir_110 || error "check striped_dir failed"

	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 110e "DNE: create striped dir, uncommit on MDT2, fail client/MDT1/MDT2"

# Barrier on both mds1 and mds2 before the striped mkdir, fail both, verify.
test_110f() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	mkdir -p $DIR/$tdir
	replay_barrier mds1
	replay_barrier mds2
	$LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
	fail mds1,mds2

	check_striped_dir_110 || error "check striped_dir failed"

	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 110f "DNE: create striped dir, fail MDT1/MDT2"

# Mirror image of test_110e: barrier on mds1 before the striped mkdir and on
# mds2 after the client unmount, then fail both and verify after remounting.
test_110g() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	mkdir -p $DIR/$tdir
	replay_barrier mds1
	$LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
	umount $MOUNT
	replay_barrier mds2
	fail mds1,mds2
	zconf_mount $(hostname) $MOUNT
	client_up || return 1

	check_striped_dir_110 || error "check striped_dir failed"

	rm -rf $DIR/$tdir || error "rmdir failed"

	return 0
}
run_test 110g "DNE: create striped dir, uncommit on MDT1, fail client/MDT1/MDT2"

# Remove a 2-stripe directory after a replay barrier on mds1, fail mds1 and
# make sure the directory is gone afterwards.
test_111a() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	mkdir -p $DIR/$tdir
	$LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
	replay_barrier mds1
	rm -rf $DIR/$tdir/striped_dir
	fail mds1

	$CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
		error "striped dir still exists"
	return 0
}
run_test 111a "DNE: unlink striped dir, fail MDT1"

# Remove the striped directory after a barrier on mds2, unmount the client,
# fail mds2, remount and make sure the directory is gone.
test_111b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	mkdir -p $DIR/$tdir
	$LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
	replay_barrier mds2
	rm -rf $DIR/$tdir/striped_dir
	umount $MOUNT
	fail mds2
	zconf_mount $(hostname) $MOUNT
	client_up || return 1

	$CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
		error "striped dir still exists"
	return 0
}
run_test 111b "DNE: unlink striped dir, fail MDT2"

# Barrier on mds1 before the unlink and on mds2 after the client unmount,
# then fail both MDSes, remount and make sure the directory is gone.
test_111c() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	mkdir -p $DIR/$tdir
	$LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
	replay_barrier mds1
	rm -rf $DIR/$tdir/striped_dir
	umount $MOUNT
	replay_barrier mds2
	fail mds1,mds2
	zconf_mount $(hostname) $MOUNT
	client_up || return 1
	$CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
		error "striped dir still exists"
	return 0
}
run_test 111c "DNE: unlink striped dir, uncommit on MDT1, fail client/MDT1/MDT2"

# Mirror image of test_111c: barrier on mds2 before the unlink and on mds1
# after the client unmount.
test_111d() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	mkdir -p $DIR/$tdir
	$LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
	replay_barrier mds2
	rm -rf $DIR/$tdir/striped_dir
	umount $MOUNT
	replay_barrier mds1
	fail mds1,mds2
	zconf_mount $(hostname) $MOUNT
	client_up || return 1
	$CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
		error "striped dir still exists"

	return 0
}
run_test 111d "DNE: unlink striped dir, uncommit on MDT2, fail client/MDT1/MDT2"

# Barrier on mds2 before the unlink and on mds1 after it, fail both MDSes
# (client stays mounted) and make sure the directory is gone.
test_111e() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	mkdir -p $DIR/$tdir
	$LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
	replay_barrier mds2
	rm -rf $DIR/$tdir/striped_dir
	replay_barrier mds1
	fail mds1,mds2
	$CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
		error "striped dir still exists"
	return 0
}
run_test 111e "DNE: unlink striped dir, uncommit on MDT2, fail MDT1/MDT2"

# Mirror image of test_111e: barrier on mds1 before the unlink and on mds2
# after it.
test_111f() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	mkdir -p $DIR/$tdir
	$LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
	replay_barrier mds1
	rm -rf $DIR/$tdir/striped_dir
	replay_barrier mds2
	fail mds1,mds2
	$CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
		error "striped dir still exists"
	return 0
}
run_test 111f "DNE: unlink striped dir, uncommit on MDT1, fail MDT1/MDT2"

# Barrier on both mds1 and mds2 before the unlink, fail both and make sure
# the directory is gone.
test_111g() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	mkdir -p $DIR/$tdir
	$LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
	replay_barrier mds1
	replay_barrier mds2
	rm -rf $DIR/$tdir/striped_dir
	fail mds1,mds2
	$CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
		error "striped dir still exists"
	return 0
}
run_test 111g "DNE: unlink striped dir, fail MDT1/MDT2"

# Build the directory layout used by the test_112* rename tests:
#   src_dir (plain mkdir) / src_child (MDT index 1) / a (regular file)
#   tgt_dir (MDT index 2) / tgt_child (MDT index 3)
test_112_rename_prepare() {
	mkdir -p $DIR/$tdir/src_dir
	$LFS mkdir -i 1 $DIR/$tdir/src_dir/src_child ||
		error "create remote source failed"

	touch $DIR/$tdir/src_dir/src_child/a

	$LFS mkdir -i 2 $DIR/$tdir/tgt_dir ||
		error "create remote target dir failed"

	$LFS mkdir -i 3 $DIR/$tdir/tgt_dir/tgt_child ||
		error "create remote target child failed"
}

# Verify the outcome of the test_112* rename: src_child must be gone and
# file "a" must be reachable under tgt_child. The tree is listed with find
# first so it appears in the test output.
test_112_check() {
	find $DIR/$tdir/
	$CHECKSTAT -t dir $DIR/$tdir/src_dir/src_child &&
		error "src_child still exists after rename"

	$CHECKSTAT -t file $DIR/$tdir/tgt_dir/tgt_child/a ||
		error "missing file(a) after rename"
}

# Common body of test_112a..test_112n.
#
# Prepares the rename layout, calls replay_barrier on every listed facet
# (in list order), renames src_child over tgt_child with mrename, fails all
# listed facets with a single "fail" call, then checks the result and
# removes the test directory.
#
# Arguments: $1 - comma-separated MDS facet list, e.g. "mds1,mds3"
test_112_rename_replay() {
	local facets=$1
	local facet

	[ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
	# NOTE(review): only the hosts of mds1 and mds2 are compared here, even
	# for the variants that fail mds3/mds4 - confirm this is sufficient.
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	test_112_rename_prepare
	for facet in ${facets//,/ }; do
		replay_barrier $facet
	done

	mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
		error "rename dir cross MDT failed!"

	fail $facets

	test_112_check
	rm -rf $DIR/$tdir || error "rmdir failed"
}

test_112a() {
	test_112_rename_replay mds1
}
run_test 112a "DNE: cross MDT rename, fail MDT1"

test_112b() {
	test_112_rename_replay mds2
}
run_test 112b "DNE: cross MDT rename, fail MDT2"

test_112c() {
	test_112_rename_replay mds3
}
run_test 112c "DNE: cross MDT rename, fail MDT3"

test_112d() {
	test_112_rename_replay mds4
}
run_test 112d "DNE: cross MDT rename, fail MDT4"

test_112e() {
	test_112_rename_replay mds1,mds2
}
run_test 112e "DNE: cross MDT rename, fail MDT1 and MDT2"

test_112f() {
	test_112_rename_replay mds1,mds3
}
run_test 112f "DNE: cross MDT rename, fail MDT1 and MDT3"

test_112g() {
	test_112_rename_replay mds1,mds4
}
run_test 112g "DNE: cross MDT rename, fail MDT1 and MDT4"

test_112h() {
	test_112_rename_replay mds2,mds3
}
run_test 112h "DNE: cross MDT rename, fail MDT2 and MDT3"

test_112i() {
	test_112_rename_replay mds2,mds4
}
run_test 112i "DNE: cross MDT rename, fail MDT2 and MDT4"

test_112j() {
	test_112_rename_replay mds3,mds4
}
run_test 112j "DNE: cross MDT rename, fail MDT3 and MDT4"

test_112k() {
	test_112_rename_replay mds1,mds2,mds3
}
run_test 112k "DNE: cross MDT rename, fail MDT1,MDT2,MDT3"

test_112l() {
	test_112_rename_replay mds1,mds2,mds4
}
run_test 112l "DNE: cross MDT rename, fail MDT1,MDT2,MDT4"

test_112m() {
	test_112_rename_replay mds1,mds3,mds4
}
run_test 112m "DNE: cross MDT rename, fail MDT1,MDT3,MDT4"

test_112n() {
	test_112_rename_replay mds2,mds3,mds4
}
run_test 112n "DNE: cross MDT rename, fail MDT2,MDT3,MDT4"

# For MDSCOUNT rounds, pick the next MDT (round-robin starting at index 1),
# set a replay barrier on it, create five striped directories whose master
# is that MDT, fail the MDS, check that all five exist and remove them.
test_115() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0
	local fail_index=0
	local index
	local i
	local j

	mkdir -p $DIR/$tdir
	for ((j = 0; j < MDSCOUNT; j++)); do
		fail_index=$((fail_index + 1))
		# 0-based MDT index; the facet name is mds<index + 1>
		index=$((fail_index % MDSCOUNT))
		replay_barrier mds$((index + 1))
		for ((i = 0; i < 5; i++)); do
			test_mkdir -i$index -c$MDSCOUNT $DIR/$tdir/test_$i ||
				error "create striped dir $DIR/$tdir/test_$i"
		done

		fail mds$((index + 1))
		for ((i = 0; i < 5; i++)); do
			checkstat -t dir $DIR/$tdir/test_$i ||
				error "$DIR/$tdir/test_$i does not exist!"
		done
		rm -rf $DIR/$tdir/test_* ||
			error "rmdir fails"
	done
}
run_test 115 "failover for create/unlink striped directory"

# Create a striped directory with fail_loc 0x80001702 set on mds1 after a
# replay barrier on mds1, fail mds1 and check the directory exists.
test_116a() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.55) ] &&
		skip "Do not support large update log before 2.7.55" &&
		return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	mkdir -p $DIR/$tdir
	replay_barrier mds1

	# OBD_FAIL_SPLIT_UPDATE_REC	0x1702
	do_facet mds1 "lctl set_param fail_loc=0x80001702"
	$LFS setdirstripe -c$MDSCOUNT $DIR/$tdir/striped_dir

	fail mds1
	$CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
		error "stried_dir does not exists"
}
run_test 116a "large update log master MDT recovery"

# Same as test_116a, but the barrier, fail_loc and failover target mds2.
test_116b() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.55) ] &&
		skip "Do not support large update log before 2.7.55" &&
		return 0

	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0

	mkdir -p $DIR/$tdir
	replay_barrier mds2

	# OBD_FAIL_SPLIT_UPDATE_REC	0x1702
	do_facet mds2 "lctl set_param fail_loc=0x80001702"
	$LFS setdirstripe -c$MDSCOUNT $DIR/$tdir/striped_dir

	fail mds2
	$CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
		error "stried_dir does not exists"
}
run_test 116b "large update log slave MDT recovery"

# Create two striped directories, set a replay barrier on every MDS, remove
# both directories and then fail all MDSes with a single "fail" call.
test_117() {
	[ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
	([ $FAILURE_MODE == "HARD" ] &&
		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
		skip "MDTs needs to be on diff hosts for HARD fail mode" &&
		return 0
	local index
	local facets

	mkdir -p $DIR/$tdir
	$LFS setdirstripe -i0 -c$MDSCOUNT $DIR/$tdir/remote_dir
	$LFS setdirstripe -i1 -c$MDSCOUNT $DIR/$tdir/remote_dir_1
	sleep 2

	# Let's set rdonly on all MDTs, so client will send
	# replay requests on all MDTs and replay these requests
	# at the same time. This test will verify the recovery
	# will not be deadlock in this case, LU-7531.
	for ((index = 1; index <= MDSCOUNT; index++)); do
		replay_barrier mds$index
		# comma-separated facet list: mds1,mds2,...
		facets="${facets:+$facets,}mds$index"
	done

	rm -rf $DIR/$tdir/remote_dir
	rm -rf $DIR/$tdir/remote_dir_1

	fail $facets

	rm -rf $DIR/$tdir || error "rmdir failed"
}
# NOTE(review): the description mentions MDT1 and MDT2 only, but the loop in
# test_117 barriers and fails every MDT.
run_test 117 "DNE: cross MDT unlink, fail MDT1 and MDT2"

# Create and remove two striped directories, recreate them with fail_loc
# 0x1705 set on mds1 (results not checked), clear fail_loc, then repeat the
# two creates after a replay barrier on mds1 and fail mds1.
# The test always returns success.
test_118() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
		skip "Do not support large update log before 2.7.64" &&
		return 0

	mkdir -p $DIR/$tdir

	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir ||
		error "setdirstripe fails"
	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir1 ||
		error "setdirstripe fails 1"
	rm -rf $DIR/$tdir/striped_dir* || error "rmdir fails"

	# OBD_FAIL_INVALIDATE_UPDATE	0x1705
	do_facet mds1 "lctl set_param fail_loc=0x1705"
	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir
	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir1
	do_facet mds1 "lctl set_param fail_loc=0x0"

	replay_barrier mds1
	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir
	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir1
	fail mds1

	true
}
run_test 118 "invalidate osp update will not cause update log corruption"

# Create 20 striped directories after a replay barrier on mds1, restart mds1
# with recovery_time_hard lowered to recovery_time_min and a replay delay
# injected, then check that every directory still reports a stripe count of 2.
test_119() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
		skip "Do not support large update log before 2.7.64" &&
		return 0
	local stripe_count
	local i
	# saved so the original value can be restored after the restart
	local hard_timeout=$(do_facet mds1 \
		"lctl get_param -n mdt.$FSNAME-MDT0000.recovery_time_hard")

	local clients=${CLIENTS:-$HOSTNAME}
	local time_min=$(recovery_time_min)

	mkdir -p $DIR/$tdir
	mkdir $DIR/$tdir/tmp
	rmdir $DIR/$tdir/tmp

	replay_barrier mds1
	mkdir $DIR/$tdir/dir_1
	for ((i = 0; i < 20; i++)); do
		$LFS setdirstripe -c2 $DIR/$tdir/stripe_dir-$i
	done

	stop mds1
	change_active mds1
	wait_for_facet mds1

	#define OBD_FAIL_TGT_REPLAY_DELAY  0x714
	do_facet mds1 $LCTL set_param fail_loc=0x80000714
	#sleep (timeout + 5), so mds will evict the client exports,
	#but DNE update recovery will keep going.
	do_facet mds1 $LCTL set_param fail_val=$((time_min + 5))

	mount_facet mds1 "-o recovery_time_hard=$time_min"

	wait_clients_import_state "$clients" mds1 FULL

	clients_up || clients_up || error "failover df: $?"

	#revert back the hard timeout
	do_facet mds1 $LCTL set_param \
		mdt.$FSNAME-MDT0000.recovery_time_hard=$hard_timeout

	for ((i = 0; i < 20; i++)); do
		stripe_count=$($LFS getdirstripe -c $DIR/$tdir/stripe_dir-$i)
		# quoted so an empty result fails the comparison cleanly
		[ "$stripe_count" == 2 ] || {
			error "stripe_dir-$i creation replay fails"
			break
		}
	done
}
run_test 119 "timeout of normal replay does not cause DNE replay fails "

# Create 20 plain and 20 striped directories after replay_barrier_nosync on
# mds1, run fail_abort on mds1 and check that none of them exists afterwards.
test_120() {
	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
	[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
		skip "Do not support large update log before 2.7.64" &&
		return 0
	local i

	mkdir $DIR/$tdir
	replay_barrier_nosync mds1
	for ((i = 0; i < 20; i++)); do
		mkdir $DIR/$tdir/dir-$i || {
			error "create dir-$i fails"
			break
		}
		$LFS setdirstripe -c2 $DIR/$tdir/stripe_dir-$i || {
			error "create stripe_dir-$i fails"
			break
		}
	done

	fail_abort mds1

	for ((i = 0; i < 20; i++)); do
		[ ! -e "$DIR/$tdir/dir-$i" ] || {
			error "dir-$i still exists"
			break
		}
		[ ! -e "$DIR/$tdir/stripe_dir-$i" ] || {
			error "stripe_dir-$i still exists"
			break
		}
	done
}
run_test 120 "DNE fail abort should stop both normal and DNE replay"

complete $SECONDS
check_and_cleanup_lustre
exit_status