diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh
index cb52a4b..31586b2 100755
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -21,8 +21,9 @@ GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""}
 require_dsh_mds || exit 0

 # Skip these tests
-# bug number for skipped tests: b=17466/LU-472
-ALWAYS_EXCEPT=" 61d $REPLAY_SINGLE_EXCEPT"
+# bug number for skipped tests:
+#	LU-472
+ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT 61d"
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!

 case "$(lsb_release -sr)" in	# only disable tests for el7
@@ -31,8 +32,8 @@ case "$(lsb_release -sr)" in	# only disable tests for el7
 	;;
 esac

-# 63 min  7 min  AT  AT  AT  AT"
-[ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 6 12 16 44a 44b 65 66 67 68"
+# 7.5  (min)"
+[ "$SLOW" = "no" ] && EXCEPT_SLOW="44b"

 [ $(facet_fstype $SINGLEMDS) = "zfs" ] &&
 # bug number for skipped test:	LU-1867	LU-3127
@@ -872,7 +873,7 @@ test_40(){
 	lctl get_param mdc.*.connect_flags | grep -q layout_lock &&
 		skip "layout_lock needs MDS connection for IO" && return 0

-	$LCTL mark multiop $MOUNT/$tfile OS_c
+	$LCTL mark "$HOSTNAME multiop $MOUNT/$tfile OS_c"
 	multiop $MOUNT/$tfile OS_c &
 	PID=$!
 	writeme -s $MOUNT/${tfile}-2 &
@@ -1184,6 +1185,10 @@ run_test 52 "time out lock replay (3764)"

 # bug 3462 - simultaneous MDC requests
 test_53a() {
+	[[ $(lctl get_param mdc.*.import |
+		grep "connect_flags:.*multi_mod_rpc") ]] ||
+		{ skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
+
 	cancel_lru_locks mdc    # cleanup locks from former test cases
 	mkdir $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
 	mkdir $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
@@ -1288,6 +1293,10 @@ test_53c() {
 run_test 53c "|X| open request and close request while two MDC requests in flight"

 test_53d() {
+	[[ $(lctl get_param mdc.*.import |
+		grep "connect_flags:.*multi_mod_rpc") ]] ||
+		{ skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
+
 	cancel_lru_locks mdc    # cleanup locks from former test cases

 	mkdir $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
@@ -2025,17 +2034,12 @@ check_for_process () {
 	killall_process $clients "$prog" -0
 }

-killall_process () {
-	local clients=${1:-$(hostname)}
-	local name=$2
-	local signal=$3
-	local rc=0
-
-	do_nodes $clients "killall $signal $name"
-}
-
 test_70b () {
 	local clients=${CLIENTS:-$HOSTNAME}
+	local mdscount=$MDSCOUNT
+
+	# until LU-6844 is fixed, run on one MDT instead of disabling test
+	mdscount=1

 	zconf_mount_clients $clients $MOUNT

@@ -2048,9 +2052,9 @@ test_70b () {
 	local start_ts=$(date +%s)
 	local cmd="rundbench 1 -t $duration"
 	local pid=""
-	if [ $MDSCOUNT -ge 2 ]; then
-		test_mkdir -p -c$MDSCOUNT $DIR/$tdir
-		$LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir
+	if [ $mdscount -ge 2 ]; then
+		test_mkdir -p -c$mdscount $DIR/$tdir
+		$LFS setdirstripe -D -c$mdscount $DIR/$tdir
 	fi
 	do_nodesv $clients "set -x; MISSING_DBENCH_OK=$MISSING_DBENCH_OK \
 		PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests/:$DBENCH_LIB \
@@ -2087,7 +2091,7 @@ test_70b () {
 		log "$TESTNAME fail mds$fail_index $num_failovers times"
 		fail mds$fail_index
 		elapsed=$(($(date +%s) - start_ts))
-		if [ $fail_index -ge $MDSCOUNT ]; then
+		if [ $fail_index -ge $mdscount ]; then
 			fail_index=1
 		else
 			fail_index=$((fail_index+1))
 		fi
@@ -2099,6 +2103,287 @@ test_70b () {
 run_test 70b "dbench ${MDSCOUNT}mdts recovery; $CLIENTCOUNT clients"
 # end multi-client tests

+random_fail_mdt() {
+	local max_index=$1
+	local duration=$2
+	local monitor_pid=$3
+	local elapsed
+	local start_ts=$(date +%s)
+	local num_failovers=0
+	local fail_index
+
+	elapsed=$(($(date +%s) - start_ts))
+	while [ $elapsed -lt $duration ]; do
+		fail_index=$((RANDOM%max_index+1))
+		kill -0 $monitor_pid ||
+			error "$monitor_pid stopped"
+		sleep 120
+		replay_barrier mds$fail_index
+		sleep 10
+		# Increment the number of failovers
+		num_failovers=$((num_failovers+1))
+		log "$TESTNAME fail mds$fail_index $num_failovers times"
+		fail mds$fail_index
+		elapsed=$(($(date +%s) - start_ts))
+	done
+}
+
+cleanup_70c() {
+	trap 0
+	rm -f $DIR/replay-single.70c.lck
+	rm -rf $DIR/$tdir
+}
+
+test_70c () {
+	local clients=${CLIENTS:-$HOSTNAME}
+	local rc=0
+
+	zconf_mount_clients $clients $MOUNT
+
+	local duration=300
+	[ "$SLOW" = "no" ] && duration=180
+	# set duration to 600 because it takes some time to boot node
+	[ "$FAILURE_MODE" = HARD ] && duration=600
+
+	local elapsed
+	local start_ts=$(date +%s)
+
+	trap cleanup_70c EXIT
+	(
+		while [ ! -e $DIR/replay-single.70c.lck ]; do
+			test_mkdir -p -c$MDSCOUNT $DIR/$tdir || break
+			if [ $MDSCOUNT -ge 2 ]; then
+				$LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir ||
+					error "set default dirstripe failed"
+			fi
+			cd $DIR/$tdir || break
+			tar cf - /etc | tar xf - || error "tar failed in loop"
+		done
+	)&
+	tar_70c_pid=$!
+	echo "Started tar $tar_70c_pid"
+
+	random_fail_mdt $MDSCOUNT $duration $tar_70c_pid
+	kill -0 $tar_70c_pid || error "tar $tar_70c_pid stopped"
+
+	touch $DIR/replay-single.70c.lck
+	wait $tar_70c_pid || error "$?: tar failed"
+
+	cleanup_70c
+	true
+}
+run_test 70c "tar ${MDSCOUNT}mdts recovery"
+
+cleanup_70d() {
+	trap 0
+	kill -9 $mkdir_70d_pid
+}
+
+test_70d () {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	local clients=${CLIENTS:-$HOSTNAME}
+	local rc=0
+
+	zconf_mount_clients $clients $MOUNT
+
+	local duration=300
+	[ "$SLOW" = "no" ] && duration=180
+	# set duration to 900 because it takes some time to boot node
+	[ "$FAILURE_MODE" = HARD ] && duration=900
+
+	mkdir -p $DIR/$tdir
+
+	local elapsed
+	local start_ts=$(date +%s)
+
+	trap cleanup_70d EXIT
+	(
+		while true; do
+			$LFS mkdir -i0 -c2 $DIR/$tdir/test || {
+				echo "mkdir fails"
+				break
+			}
+			$LFS mkdir -i1 -c2 $DIR/$tdir/test1 || {
+				echo "mkdir fails"
+				break
+			}
+
+			touch $DIR/$tdir/test/a || {
+				echo "touch fails"
+				break;
+			}
+			mkdir $DIR/$tdir/test/b || {
+				echo "mkdir fails"
+				break;
+			}
+			rm -rf $DIR/$tdir/test || {
+				echo "rmdir fails"
+				break
+			}
+
+			touch $DIR/$tdir/test1/a || {
+				echo "touch fails"
+				break;
+			}
+			mkdir $DIR/$tdir/test1/b || {
+				echo "mkdir fails"
+				break;
+			}
+
+			rm -rf $DIR/$tdir/test1 || {
+				echo "rmdir fails"
+				break
+			}
+		done
+	)&
+	mkdir_70d_pid=$!
+ echo "Started $mkdir_70d_pid" + + random_fail_mdt $MDSCOUNT $duration $mkdir_70d_pid + kill -0 $mkdir_70d_pid || error "mkdir/rmdir $mkdir_70d_pid stopped" + + cleanup_70d + true +} +run_test 70d "mkdir/rmdir striped dir ${MDSCOUNT}mdts recovery" + +cleanup_70e() { + trap 0 + kill -9 $rename_70e_pid +} + +test_70e () { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + local clients=${CLIENTS:-$HOSTNAME} + local rc=0 + + echo ha > /proc/sys/lnet/debug + zconf_mount_clients $clients $MOUNT + + local duration=300 + [ "$SLOW" = "no" ] && duration=180 + # set duration to 900 because it takes some time to boot node + [ "$FAILURE_MODE" = HARD ] && duration=900 + + mkdir -p $DIR/$tdir + $LFS mkdir -i0 $DIR/$tdir/test_0 + $LFS mkdir -i0 $DIR/$tdir/test_1 + touch $DIR/$tdir/test_0/a + touch $DIR/$tdir/test_1/b + trap cleanup_70e EXIT + ( + while true; do + mrename $DIR/$tdir/test_0/a $DIR/$tdir/test_1/b > \ + /dev/null || { + echo "a->b fails" + break; + } + + checkstat $DIR/$tdir/test_0/a && { + echo "a still exists" + break + } + + checkstat $DIR/$tdir/test_1/b || { + echo "b still exists" + break + } + + touch $DIR/$tdir/test_0/a || { + echo "touch a fails" + break + } + + mrename $DIR/$tdir/test_1/b $DIR/$tdir/test_0/a > \ + /dev/null || { + echo "a->a fails" + break; + } + done + )& + rename_70e_pid=$! + echo "Started $rename_70e_pid" + + random_fail_mdt 2 $duration $rename_70e_pid + kill -0 $rename_70e_pid || error "rename $rename_70e_pid stopped" + + cleanup_70e + true +} +run_test 70e "rename cross-MDT with random fails" + +cleanup_71a() { + trap 0 + kill -9 $mkdir_71a_pid +} + +random_double_fail_mdt() { + local max_index=$1 + local duration=$2 + local monitor_pid=$3 + local elapsed + local start_ts=$(date +%s) + local num_failovers=0 + local fail_index + local second_index + + elapsed=$(($(date +%s) - start_ts)) + while [ $elapsed -lt $duration ]; do + fail_index=$((RANDOM%max_index + 1)) + if [ $fail_index -eq $max_index ]; then + second_index=1 + else + second_index=$((fail_index + 1)) + fi + kill -0 $monitor_pid || + error "$monitor_pid stopped" + sleep 120 + replay_barrier mds$fail_index + replay_barrier mds$second_index + sleep 10 + # Increment the number of failovers + num_failovers=$((num_failovers+1)) + log "fail mds$fail_index mds$second_index $num_failovers times" + fail mds${fail_index},mds${second_index} + elapsed=$(($(date +%s) - start_ts)) + done +} + +test_71a () { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + local clients=${CLIENTS:-$HOSTNAME} + local rc=0 + + zconf_mount_clients $clients $MOUNT + + local duration=300 + [ "$SLOW" = "no" ] && duration=180 + # set duration to 900 because it takes some time to boot node + [ "$FAILURE_MODE" = HARD ] && duration=900 + + mkdir -p $DIR/$tdir + + local elapsed + local start_ts=$(date +%s) + + trap cleanup_71a EXIT + ( + while true; do + $LFS mkdir -i0 -c2 $DIR/$tdir/test + rmdir $DIR/$tdir/test + done + )& + mkdir_71a_pid=$! 
+ echo "Started $mkdir_71a_pid" + + random_double_fail_mdt 2 $duration $mkdir_71a_pid + kill -0 $mkdir_71a_pid || error "mkdir/rmdir $mkdir_71a_pid stopped" + + cleanup_71a + true +} +run_test 71a "mkdir/rmdir striped dir with 2 mdts recovery" + test_73a() { multiop_bg_pause $DIR/$tfile O_tSc || error "multiop_bg_pause $DIR/$tfile failed" @@ -3095,6 +3380,241 @@ test_101() { #LU-5648 } run_test 101 "Shouldn't reassign precreated objs to other files after recovery" +test_102a() { + local idx + local facet + local num + local i + local pids pid + + [[ $(lctl get_param mdc.*.import | + grep "connect_flags:.*multi_mod_rpc") ]] || + { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; } + + $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed" + idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir)) + facet="mds$((0x$idx + 1))" + + # get current value of max_mod_rcps_in_flight + num=$($LCTL get_param -n \ + mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight) + # set default value if client does not support multi mod RPCs + [ -z "$num" ] && num=1 + + echo "creating $num files ..." + umask 0022 + for i in $(seq $num); do + touch $DIR/$tdir/file-$i + done + + # drop request on MDT to force resend + #define OBD_FAIL_MDS_REINT_MULTI_NET 0x159 + do_facet $facet "$LCTL set_param fail_loc=0x159" + echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..." + for i in $(seq $num); do + chmod 0600 $DIR/$tdir/file-$i & + pids="$pids $!" + done + sleep 1 + do_facet $facet "$LCTL set_param fail_loc=0" + for pid in $pids; do + wait $pid || error "chmod failed" + done + echo "done ($(date +%H:%M:%S))" + + # check chmod succeed + for i in $(seq $num); do + checkstat -vp 0600 $DIR/$tdir/file-$i + done + + rm -rf $DIR/$tdir +} +run_test 102a "check resend (request lost) with multiple modify RPCs in flight" + +test_102b() { + local idx + local facet + local num + local i + local pids pid + + [[ $(lctl get_param mdc.*.import | + grep "connect_flags:.*multi_mod_rpc") ]] || + { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; } + + $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed" + idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir)) + facet="mds$((0x$idx + 1))" + + # get current value of max_mod_rcps_in_flight + num=$($LCTL get_param -n \ + mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight) + # set default value if client does not support multi mod RPCs + [ -z "$num" ] && num=1 + + echo "creating $num files ..." + umask 0022 + for i in $(seq $num); do + touch $DIR/$tdir/file-$i + done + + # drop reply on MDT to force reconstruction + #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a + do_facet $facet "$LCTL set_param fail_loc=0x15a" + echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..." + for i in $(seq $num); do + chmod 0600 $DIR/$tdir/file-$i & + pids="$pids $!" 
+	done
+	sleep 1
+	do_facet $facet "$LCTL set_param fail_loc=0"
+	for pid in $pids; do
+		wait $pid || error "chmod failed"
+	done
+	echo "done ($(date +%H:%M:%S))"
+
+	# check chmod succeeded
+	for i in $(seq $num); do
+		checkstat -vp 0600 $DIR/$tdir/file-$i
+	done
+
+	rm -rf $DIR/$tdir
+}
+run_test 102b "check resend (reply lost) with multiple modify RPCs in flight"
+
+test_102c() {
+	local idx
+	local facet
+	local num
+	local i
+	local pids pid
+
+	[[ $(lctl get_param mdc.*.import |
+		grep "connect_flags:.*multi_mod_rpc") ]] ||
+		{ skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
+
+	$LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+	idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir))
+	facet="mds$((0x$idx + 1))"
+
+	# get current value of max_mod_rpcs_in_flight
+	num=$($LCTL get_param -n \
+		mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight)
+	# set default value if client does not support multi mod RPCs
+	[ -z "$num" ] && num=1
+
+	echo "creating $num files ..."
+	umask 0022
+	for i in $(seq $num); do
+		touch $DIR/$tdir/file-$i
+	done
+
+	replay_barrier $facet
+
+	# drop reply on MDT
+	#define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
+	do_facet $facet "$LCTL set_param fail_loc=0x15a"
+	echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..."
+	for i in $(seq $num); do
+		chmod 0600 $DIR/$tdir/file-$i &
+		pids="$pids $!"
+	done
+	sleep 1
+	do_facet $facet "$LCTL set_param fail_loc=0"
+
+	# fail MDT
+	fail $facet
+
+	for pid in $pids; do
+		wait $pid || error "chmod failed"
+	done
+	echo "done ($(date +%H:%M:%S))"
+
+	# check chmod succeeded
+	for i in $(seq $num); do
+		checkstat -vp 0600 $DIR/$tdir/file-$i
+	done
+
+	rm -rf $DIR/$tdir
+}
+run_test 102c "check replay w/o reconstruction with multiple mod RPCs in flight"
+
+test_102d() {
+	local idx
+	local facet
+	local num
+	local i
+	local pids pid
+
+	[[ $(lctl get_param mdc.*.import |
+		grep "connect_flags:.*multi_mod_rpc") ]] ||
+		{ skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
+
+	$LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+	idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir))
+	facet="mds$((0x$idx + 1))"
+
+	# get current value of max_mod_rpcs_in_flight
+	num=$($LCTL get_param -n \
+		mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight)
+	# set default value if client does not support multi mod RPCs
+	[ -z "$num" ] && num=1
+
+	echo "creating $num files ..."
+	umask 0022
+	for i in $(seq $num); do
+		touch $DIR/$tdir/file-$i
+	done
+
+	# drop reply on MDT
+	#define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
+	do_facet $facet "$LCTL set_param fail_loc=0x15a"
+	echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..."
+	for i in $(seq $num); do
+		chmod 0600 $DIR/$tdir/file-$i &
+		pids="$pids $!"
+	done
+	sleep 1
+
+	# write MDT transactions to disk
+	do_facet $facet "sync; sync; sync"
+
+	do_facet $facet "$LCTL set_param fail_loc=0"
+
+	# fail MDT
+	fail $facet
+
+	for pid in $pids; do
+		wait $pid || error "chmod failed"
+	done
+	echo "done ($(date +%H:%M:%S))"
+
+	# check chmod succeeded
+	for i in $(seq $num); do
+		checkstat -vp 0600 $DIR/$tdir/file-$i
+	done
+
+	rm -rf $DIR/$tdir
+}
+run_test 102d "check replay & reconstruction with multiple mod RPCs in flight"
+
+test_103() {
+	remote_mds_nodsh && skip "remote MDS with nodsh" && return
+#define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162
+	do_facet mds1 $LCTL set_param fail_loc=0x80000162
+
+	mkdir -p $DIR/$tdir
+	createmany -o $DIR/$tdir/t- 30 ||
+		error "create files on remote directory failed"
+	sync
+	rm -rf $DIR/$tdir/t-*
+	sync
+# MDS should crash with tr->otr_next_id overflow
+	fail mds1
+}
+run_test 103 "Check otr_next_id overflow"
+
+
 check_striped_dir_110() {
 	$CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
@@ -3749,6 +4269,206 @@ test_115() {
 }
 run_test 115 "failover for create/unlink striped directory"

+test_116a() {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.55) ] &&
+		skip "Do not support large update log before 2.7.55" &&
+		return 0
+	([ $FAILURE_MODE == "HARD" ] &&
+		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
+		skip "MDTs need to be on different hosts for HARD fail mode" &&
+		return 0
+	local fail_index=0
+
+	mkdir -p $DIR/$tdir
+	replay_barrier mds1
+
+	# OBD_FAIL_SPLIT_UPDATE_REC 0x1702
+	do_facet mds1 "lctl set_param fail_loc=0x80001702"
+	$LFS setdirstripe -c$MDSCOUNT $DIR/$tdir/striped_dir
+
+	fail mds1
+	$CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
+		error "striped_dir does not exist"
+}
+run_test 116a "large update log master MDT recovery"
+
+test_116b() {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.55) ] &&
+		skip "Do not support large update log before 2.7.55" &&
+		return 0
+
+	([ $FAILURE_MODE == "HARD" ] &&
+		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
+		skip "MDTs need to be on different hosts for HARD fail mode" &&
+		return 0
+	local fail_index=0
+
+	mkdir -p $DIR/$tdir
+	replay_barrier mds2
+
+	# OBD_FAIL_SPLIT_UPDATE_REC 0x1702
+	do_facet mds2 "lctl set_param fail_loc=0x80001702"
+	$LFS setdirstripe -c$MDSCOUNT $DIR/$tdir/striped_dir
+
+	fail mds2
+	$CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
+		error "striped_dir does not exist"
+}
+run_test 116b "large update log slave MDT recovery"
+
+test_117() {
+	[ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
+	([ $FAILURE_MODE == "HARD" ] &&
+		[ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
+		skip "MDTs need to be on different hosts for HARD fail mode" &&
+		return 0
+	local index
+	local mds_indexs
+
+	mkdir -p $DIR/$tdir
+	$LFS setdirstripe -i0 -c$MDSCOUNT $DIR/$tdir/remote_dir
+	$LFS setdirstripe -i1 -c$MDSCOUNT $DIR/$tdir/remote_dir_1
+	sleep 2
+
+	# Let's set rdonly on all MDTs, so client will send
+	# replay requests on all MDTs and replay these requests
+	# at the same time. This test will verify the recovery
+	# will not deadlock in this case, LU-7531.
+	for ((index = 0; index < $((MDSCOUNT)); index++)); do
+		replay_barrier mds$((index + 1))
+		if [ -z "$mds_indexs" ]; then
+			mds_indexs="${mds_indexs}mds$((index+1))"
+		else
+			mds_indexs="${mds_indexs},mds$((index+1))"
+		fi
+	done
+
+	rm -rf $DIR/$tdir/remote_dir
+	rm -rf $DIR/$tdir/remote_dir_1
+
+	fail $mds_indexs
+
+	rm -rf $DIR/$tdir || error "rmdir failed"
+}
+run_test 117 "DNE: cross MDT unlink, fail MDT1 and MDT2"
+
+test_118() {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
+		skip "Do not support large update log before 2.7.64" &&
+		return 0
+
+	mkdir -p $DIR/$tdir
+
+	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir ||
+		error "setdirstripe fails"
+	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir1 ||
+		error "setdirstripe fails 1"
+	rm -rf $DIR/$tdir/striped_dir* || error "rmdir fails"
+
+	# OBD_FAIL_INVALIDATE_UPDATE 0x1705
+	do_facet mds1 "lctl set_param fail_loc=0x1705"
+	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir
+	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir1
+	do_facet mds1 "lctl set_param fail_loc=0x0"
+
+	replay_barrier mds1
+	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir
+	$LFS setdirstripe -c2 $DIR/$tdir/striped_dir1
+	fail mds1
+
+	true
+}
+run_test 118 "invalidate osp update will not cause update log corruption"
+
+test_119() {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
+		skip "Do not support large update log before 2.7.64" &&
+		return 0
+	local stripe_count
+	local hard_timeout=$(do_facet mds1 \
+		"lctl get_param -n mdt.$FSNAME-MDT0000.recovery_time_hard")
+
+	local clients=${CLIENTS:-$HOSTNAME}
+	local time_min=$(recovery_time_min)
+
+	mkdir -p $DIR/$tdir
+	mkdir $DIR/$tdir/tmp
+	rmdir $DIR/$tdir/tmp
+
+	replay_barrier mds1
+	mkdir $DIR/$tdir/dir_1
+	for ((i = 0; i < 20; i++)); do
+		$LFS setdirstripe -c2 $DIR/$tdir/stripe_dir-$i
+	done
+
+	stop mds1
+	change_active mds1
+	wait_for_facet mds1
+
+	#define OBD_FAIL_TGT_REPLAY_DELAY 0x714
+	do_facet mds1 $LCTL set_param fail_loc=0x80000714
+	# sleep (timeout + 5), so mds will evict the client exports,
+	# but DNE update recovery will keep going.
+	do_facet mds1 $LCTL set_param fail_val=$((time_min + 5))
+
+	mount_facet mds1 "-o recovery_time_hard=$time_min"
+
+	wait_clients_import_state "$clients" mds1 FULL
+
+	clients_up || clients_up || error "failover df: $?"
+
+	# revert the hard timeout
+	do_facet mds1 $LCTL set_param \
+		mdt.$FSNAME-MDT0000.recovery_time_hard=$hard_timeout
+
+	for ((i = 0; i < 20; i++)); do
+		stripe_count=$($LFS getdirstripe -c $DIR/$tdir/stripe_dir-$i)
+		[ $stripe_count == 2 ] || {
+			error "stripe_dir-$i creation replay fails"
+			break
+		}
+	done
+}
+run_test 119 "timeout of normal replay does not cause DNE replay to fail"
+
+test_120() {
+	[ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+	[ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
+		skip "Do not support large update log before 2.7.64" &&
+		return 0
+
+	mkdir $DIR/$tdir
+	replay_barrier_nosync mds1
+	for ((i = 0; i < 20; i++)); do
+		mkdir $DIR/$tdir/dir-$i || {
+			error "create dir-$i fails"
+			break
+		}
+		$LFS setdirstripe -c2 $DIR/$tdir/stripe_dir-$i || {
+			error "create stripe_dir-$i fails"
+			break
+		}
+	done
+
+	fail_abort mds1
+
+	for ((i = 0; i < 20; i++)); do
+		[ ! -e "$DIR/$tdir/dir-$i" ] || {
+			error "dir-$i still exists"
+			break
+		}
+		[ ! -e "$DIR/$tdir/stripe_dir-$i" ] || {
-e "$DIR/$tdir/stripe_dir-$i" ] || { + error "stripe_dir-$i still exists" + break + } + done +} +run_test 120 "DNE fail abort should stop both normal and DNE replay" + complete $SECONDS check_and_cleanup_lustre exit_status