require_dsh_mds || exit 0
# Skip these tests
-# bug number for skipped tests: b=17466/LU-472
-ALWAYS_EXCEPT=" 61d $REPLAY_SINGLE_EXCEPT"
+# bug number for skipped tests:
+# LU-472
+ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT 61d"
# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
case "$(lsb_release -sr)" in # only disable tests for el7
;;
esac
-# 63 min 7 min AT AT AT AT"
-[ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 6 12 16 44a 44b 65 66 67 68"
+# 7.5 (min)"
+[ "$SLOW" = "no" ] && EXCEPT_SLOW="44b"
[ $(facet_fstype $SINGLEMDS) = "zfs" ] &&
# bug number for skipped test: LU-1867 LU-3127
lctl get_param mdc.*.connect_flags | grep -q layout_lock &&
skip "layout_lock needs MDS connection for IO" && return 0
- $LCTL mark multiop $MOUNT/$tfile OS_c
+ $LCTL mark "$HOSTNAME multiop $MOUNT/$tfile OS_c"
multiop $MOUNT/$tfile OS_c &
PID=$!
writeme -s $MOUNT/${tfile}-2 &
# bug 3462 - simultaneous MDC requests
test_53a() {
+ [[ $(lctl get_param mdc.*.import |
+ grep "connect_flags:.*multi_mod_rpc") ]] ||
+ { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
+
cancel_lru_locks mdc # cleanup locks from former test cases
mkdir $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
mkdir $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
run_test 53c "|X| open request and close request while two MDC requests in flight"
test_53d() {
+ [[ $(lctl get_param mdc.*.import |
+ grep "connect_flags:.*multi_mod_rpc") ]] ||
+ { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
+
cancel_lru_locks mdc # cleanup locks from former test cases
mkdir $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
killall_process $clients "$prog" -0
}
-killall_process () {
- local clients=${1:-$(hostname)}
- local name=$2
- local signal=$3
- local rc=0
-
- do_nodes $clients "killall $signal $name"
-}
-
test_70b () {
local clients=${CLIENTS:-$HOSTNAME}
+ local mdscount=$MDSCOUNT
+
+ # until LU-6844 is fixed, run on one MDT instead of disabling test
+ mdscount=1
zconf_mount_clients $clients $MOUNT
local start_ts=$(date +%s)
local cmd="rundbench 1 -t $duration"
local pid=""
- if [ $MDSCOUNT -ge 2 ]; then
- test_mkdir -p -c$MDSCOUNT $DIR/$tdir
- $LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir
+ if [ $mdscount -ge 2 ]; then
+ test_mkdir -p -c$mdscount $DIR/$tdir
+ $LFS setdirstripe -D -c$mdscount $DIR/$tdir
fi
do_nodesv $clients "set -x; MISSING_DBENCH_OK=$MISSING_DBENCH_OK \
PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests/:$DBENCH_LIB \
log "$TESTNAME fail mds$fail_index $num_failovers times"
fail mds$fail_index
elapsed=$(($(date +%s) - start_ts))
- if [ $fail_index -ge $MDSCOUNT ]; then
+ if [ $fail_index -ge $mdscount ]; then
fail_index=1
else
fail_index=$((fail_index+1))
run_test 70b "dbench ${MDSCOUNT}mdts recovery; $CLIENTCOUNT clients"
# end multi-client tests
+random_fail_mdt() {
+ local max_index=$1
+ local duration=$2
+ local monitor_pid=$3
+ local elapsed
+ local start_ts=$(date +%s)
+ local num_failovers=0
+ local fail_index
+
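+ # loop for roughly $duration seconds: pick a random MDT in
+ # 1..max_index, verify the client workload ($monitor_pid) is still
+ # alive, let it generate load, set a replay barrier so the updates
+ # that follow must be replayed, then fail over the chosen MDT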
+ elapsed=$(($(date +%s) - start_ts))
+ while [ $elapsed -lt $duration ]; do
+ fail_index=$((RANDOM%max_index+1))
+ kill -0 $monitor_pid ||
+ error "$monitor_pid stopped"
+ sleep 120
+ replay_barrier mds$fail_index
+ sleep 10
+ # Increment the number of failovers
+ num_failovers=$((num_failovers+1))
+ log "$TESTNAME fail mds$fail_index $num_failovers times"
+ fail mds$fail_index
+ elapsed=$(($(date +%s) - start_ts))
+ done
+}
+
+cleanup_70c() {
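+ # "trap 0" clears the EXIT trap installed by test_70c so cleanup
+ # does not run a second time on exit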
+ trap 0
+ rm -f $DIR/replay-single.70c.lck
+ rm -rf $DIR/$tdir
+}
+
+test_70c () {
+ local clients=${CLIENTS:-$HOSTNAME}
+ local rc=0
+
+ zconf_mount_clients $clients $MOUNT
+
+ local duration=300
+ [ "$SLOW" = "no" ] && duration=180
+ # set duration to 600 because it takes some time to boot the node
+ [ "$FAILURE_MODE" = HARD ] && duration=600
+
+ local elapsed
+ local start_ts=$(date +%s)
+
+ trap cleanup_70c EXIT
+ (
+ while [ ! -e $DIR/replay-single.70c.lck ]; do
+ test_mkdir -p -c$MDSCOUNT $DIR/$tdir || break
+ if [ $MDSCOUNT -ge 2 ]; then
+ $LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir ||
+ error "set default dirstripe failed"
+ fi
+ cd $DIR/$tdir || break
+ tar cf - /etc | tar xf - || error "tar failed in loop"
+ done
+ )&
+ tar_70c_pid=$!
+ echo "Started tar $tar_70c_pid"
+
+ random_fail_mdt $MDSCOUNT $duration $tar_70c_pid
+ kill -0 $tar_70c_pid || error "tar $tar_70c_pid stopped"
+
+ touch $DIR/replay-single.70c.lck
+ wait $tar_70c_pid || error "$?: tar failed"
+
+ cleanup_70c
+ true
+}
+run_test 70c "tar ${MDSCOUNT}mdts recovery"
+
+cleanup_70d() {
+ trap 0
+ kill -9 $mkdir_70d_pid
+}
+
+test_70d () {
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+ local clients=${CLIENTS:-$HOSTNAME}
+ local rc=0
+
+ zconf_mount_clients $clients $MOUNT
+
+ local duration=300
+ [ "$SLOW" = "no" ] && duration=180
+ # set duration to 900 because it takes some time to boot the node
+ [ "$FAILURE_MODE" = HARD ] && duration=900
+
+ mkdir -p $DIR/$tdir
+
+ local elapsed
+ local start_ts=$(date +%s)
+
+ trap cleanup_70d EXIT
+ (
+ while true; do
+ $LFS mkdir -i0 -c2 $DIR/$tdir/test || {
+ echo "mkdir fails"
+ break
+ }
+ $LFS mkdir -i1 -c2 $DIR/$tdir/test1 || {
+ echo "mkdir fails"
+ break
+ }
+
+ touch $DIR/$tdir/test/a || {
+ echo "touch fails"
+ break
+ }
+ mkdir $DIR/$tdir/test/b || {
+ echo "mkdir fails"
+ break
+ }
+ rm -rf $DIR/$tdir/test || {
+ echo "rmdir fails"
+ break
+ }
+
+ touch $DIR/$tdir/test1/a || {
+ echo "touch fails"
+ break
+ }
+ mkdir $DIR/$tdir/test1/b || {
+ echo "mkdir fails"
+ break
+ }
+
+ rm -rf $DIR/$tdir/test1 || {
+ echo "rmdir fails"
+ break
+ }
+ done
+ )&
+ mkdir_70d_pid=$!
+ echo "Started $mkdir_70d_pid"
+
+ random_fail_mdt $MDSCOUNT $duration $mkdir_70d_pid
+ kill -0 $mkdir_70d_pid || error "mkdir/rmdir $mkdir_70d_pid stopped"
+
+ cleanup_70d
+ true
+}
+run_test 70d "mkdir/rmdir striped dir ${MDSCOUNT}mdts recovery"
+
+cleanup_70e() {
+ trap 0
+ kill -9 $rename_70e_pid
+}
+
+test_70e () {
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+ local clients=${CLIENTS:-$HOSTNAME}
+ local rc=0
+
+ $LCTL set_param debug=ha
+ zconf_mount_clients $clients $MOUNT
+
+ local duration=300
+ [ "$SLOW" = "no" ] && duration=180
+ # set duration to 900 because it takes some time to boot the node
+ [ "$FAILURE_MODE" = HARD ] && duration=900
+
+ mkdir -p $DIR/$tdir
+ $LFS mkdir -i0 $DIR/$tdir/test_0
+ $LFS mkdir -i1 $DIR/$tdir/test_1
+ touch $DIR/$tdir/test_0/a
+ touch $DIR/$tdir/test_1/b
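+ # a lives under test_0 (MDT0) and b under test_1 (MDT1), so every
+ # rename below is a cross-MDT operation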
+ trap cleanup_70e EXIT
+ (
+ while true; do
+ mrename $DIR/$tdir/test_0/a $DIR/$tdir/test_1/b > \
+ /dev/null || {
+ echo "a->b fails"
+ break
+ }
+
+ checkstat $DIR/$tdir/test_0/a && {
+ echo "a still exists"
+ break
+ }
+
+ checkstat $DIR/$tdir/test_1/b || {
+ echo "b still exists"
+ break
+ }
+
+ touch $DIR/$tdir/test_0/a || {
+ echo "touch a fails"
+ break
+ }
+
+ mrename $DIR/$tdir/test_1/b $DIR/$tdir/test_0/a > \
+ /dev/null || {
+ echo "a->a fails"
+ break;
+ }
+ done
+ )&
+ rename_70e_pid=$!
+ echo "Started $rename_70e_pid"
+
+ random_fail_mdt 2 $duration $rename_70e_pid
+ kill -0 $rename_70e_pid || error "rename $rename_70e_pid stopped"
+
+ cleanup_70e
+ true
+}
+run_test 70e "rename cross-MDT with random fails"
+
+cleanup_71a() {
+ trap 0
+ kill -9 $mkdir_71a_pid
+}
+
+random_double_fail_mdt() {
+ local max_index=$1
+ local duration=$2
+ local monitor_pid=$3
+ local elapsed
+ local start_ts=$(date +%s)
+ local num_failovers=0
+ local fail_index
+ local second_index
+
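+ # like random_fail_mdt(), but fail two MDTs at a time: a random
+ # index plus its successor (wrapping back to 1) to exercise
+ # concurrent recovery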
+ elapsed=$(($(date +%s) - start_ts))
+ while [ $elapsed -lt $duration ]; do
+ fail_index=$((RANDOM%max_index + 1))
+ if [ $fail_index -eq $max_index ]; then
+ second_index=1
+ else
+ second_index=$((fail_index + 1))
+ fi
+ kill -0 $monitor_pid ||
+ error "$monitor_pid stopped"
+ sleep 120
+ replay_barrier mds$fail_index
+ replay_barrier mds$second_index
+ sleep 10
+ # Increment the number of failovers
+ num_failovers=$((num_failovers+1))
+ log "fail mds$fail_index mds$second_index $num_failovers times"
+ fail mds${fail_index},mds${second_index}
+ elapsed=$(($(date +%s) - start_ts))
+ done
+}
+
+test_71a () {
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+ local clients=${CLIENTS:-$HOSTNAME}
+ local rc=0
+
+ zconf_mount_clients $clients $MOUNT
+
+ local duration=300
+ [ "$SLOW" = "no" ] && duration=180
+ # set duration to 900 because it takes some time to boot the node
+ [ "$FAILURE_MODE" = HARD ] && duration=900
+
+ mkdir -p $DIR/$tdir
+
+ local elapsed
+ local start_ts=$(date +%s)
+
+ trap cleanup_71a EXIT
+ (
+ while true; do
+ $LFS mkdir -i0 -c2 $DIR/$tdir/test
+ rmdir $DIR/$tdir/test
+ done
+ )&
+ mkdir_71a_pid=$!
+ echo "Started $mkdir_71a_pid"
+
+ random_double_fail_mdt 2 $duration $mkdir_71a_pid
+ kill -0 $mkdir_71a_pid || error "mkdir/rmdir $mkdir_71a_pid stopped"
+
+ cleanup_71a
+ true
+}
+run_test 71a "mkdir/rmdir striped dir with 2 mdts recovery"
+
test_73a() {
multiop_bg_pause $DIR/$tfile O_tSc ||
error "multiop_bg_pause $DIR/$tfile failed"
}
run_test 101 "Shouldn't reassign precreated objs to other files after recovery"
+test_102a() {
+ local idx
+ local facet
+ local num
+ local i
+ local pids pid
+
+ [[ $(lctl get_param mdc.*.import |
+ grep "connect_flags:.*multi_mod_rpc") ]] ||
+ { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
+
+ $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+ idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir))
+ facet="mds$((0x$idx + 1))"
+
+ # get current value of max_mod_rpcs_in_flight
+ num=$($LCTL get_param -n \
+ mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight)
+ # set default value if client does not support multi mod RPCs
+ [ -z "$num" ] && num=1
+
+ echo "creating $num files ..."
+ umask 0022
+ for i in $(seq $num); do
+ touch $DIR/$tdir/file-$i
+ done
+
+ # drop request on MDT to force resend
+ #define OBD_FAIL_MDS_REINT_MULTI_NET 0x159
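+ # dropping the request means the MDT never processes it, so each
+ # chmod is resent after its timeout; with multiple modify RPCs in
+ # flight the resends are all outstanding at once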
+ do_facet $facet "$LCTL set_param fail_loc=0x159"
+ echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..."
+ for i in $(seq $num); do
+ chmod 0600 $DIR/$tdir/file-$i &
+ pids="$pids $!"
+ done
+ sleep 1
+ do_facet $facet "$LCTL set_param fail_loc=0"
+ for pid in $pids; do
+ wait $pid || error "chmod failed"
+ done
+ echo "done ($(date +%H:%M:%S))"
+
+ # check chmod succeed
+ for i in $(seq $num); do
+ checkstat -vp 0600 $DIR/$tdir/file-$i ||
+ error "file-$i mode is not 0600"
+ done
+
+ rm -rf $DIR/$tdir
+}
+run_test 102a "check resend (request lost) with multiple modify RPCs in flight"
+
+test_102b() {
+ local idx
+ local facet
+ local num
+ local i
+ local pids pid
+
+ [[ $(lctl get_param mdc.*.import |
+ grep "connect_flags:.*multi_mod_rpc") ]] ||
+ { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
+
+ $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+ idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir))
+ facet="mds$((0x$idx + 1))"
+
+ # get current value of max_mod_rpcs_in_flight
+ num=$($LCTL get_param -n \
+ mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight)
+ # set default value if client does not support multi mod RPCs
+ [ -z "$num" ] && num=1
+
+ echo "creating $num files ..."
+ umask 0022
+ for i in $(seq $num); do
+ touch $DIR/$tdir/file-$i
+ done
+
+ # drop reply on MDT to force reconstruction
+ #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
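+ # the MDT executes each chmod but its reply is lost, so the resent
+ # requests must be answered from saved reply data (reply
+ # reconstruction) instead of being re-executed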
+ do_facet $facet "$LCTL set_param fail_loc=0x15a"
+ echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..."
+ for i in $(seq $num); do
+ chmod 0600 $DIR/$tdir/file-$i &
+ pids="$pids $!"
+ done
+ sleep 1
+ do_facet $facet "$LCTL set_param fail_loc=0"
+ for pid in $pids; do
+ wait $pid || error "chmod failed"
+ done
+ echo "done ($(date +%H:%M:%S))"
+
+ # check chmod succeed
+ for i in $(seq $num); do
+ checkstat -vp 0600 $DIR/$tdir/file-$i ||
+ error "file-$i mode is not 0600"
+ done
+
+ rm -rf $DIR/$tdir
+}
+run_test 102b "check resend (reply lost) with multiple modify RPCs in flight"
+
+test_102c() {
+ local idx
+ local facet
+ local num
+ local i
+ local pids pid
+
+ [[ $(lctl get_param mdc.*.import |
+ grep "connect_flags:.*multi_mod_rpc") ]] ||
+ { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
+
+ $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+ idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir))
+ facet="mds$((0x$idx + 1))"
+
+ # get current value of max_mod_rpcs_in_flight
+ num=$($LCTL get_param -n \
+ mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight)
+ # set default value if client does not support multi mod RPCs
+ [ -z "$num" ] && num=1
+
+ echo "creating $num files ..."
+ umask 0022
+ for i in $(seq $num); do
+ touch $DIR/$tdir/file-$i
+ done
+
+ replay_barrier $facet
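+ # the barrier keeps the chmods below uncommitted on the MDT, so
+ # after the failover they are replayed by the client rather than
+ # reconstructed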
+
+ # drop reply on MDT
+ #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
+ do_facet $facet "$LCTL set_param fail_loc=0x15a"
+ echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..."
+ for i in $(seq $num); do
+ chmod 0600 $DIR/$tdir/file-$i &
+ pids="$pids $!"
+ done
+ sleep 1
+ do_facet $facet "$LCTL set_param fail_loc=0"
+
+ # fail MDT
+ fail $facet
+
+ for pid in $pids; do
+ wait $pid || error "chmod failed"
+ done
+ echo "done ($(date +%H:%M:%S))"
+
+ # check chmod succeed
+ for i in $(seq $num); do
+ checkstat -vp 0600 $DIR/$tdir/file-$i ||
+ error "file-$i mode is not 0600"
+ done
+
+ rm -rf $DIR/$tdir
+}
+run_test 102c "check replay w/o reconstruction with multiple mod RPCs in flight"
+
+test_102d() {
+ local idx
+ local facet
+ local num
+ local i
+ local pids pid
+
+ [[ $(lctl get_param mdc.*.import |
+ grep "connect_flags:.*multi_mod_rpc") ]] ||
+ { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
+
+ $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+ idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir))
+ facet="mds$((0x$idx + 1))"
+
+ # get current value of max_mod_rpcs_in_flight
+ num=$($LCTL get_param -n \
+ mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight)
+ # set default value if client does not support multi mod RPCs
+ [ -z "$num" ] && num=1
+
+ echo "creating $num files ..."
+ umask 0022
+ for i in $(seq $num); do
+ touch $DIR/$tdir/file-$i
+ done
+
+ # drop reply on MDT
+ #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
+ do_facet $facet "$LCTL set_param fail_loc=0x15a"
+ echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..."
+ for i in $(seq $num); do
+ chmod 0600 $DIR/$tdir/file-$i &
+ pids="$pids $!"
+ done
+ sleep 1
+
+ # write MDT transactions to disk
+ do_facet $facet "sync; sync; sync"
+
+ do_facet $facet "$LCTL set_param fail_loc=0"
+
+ # fail MDT
+ fail $facet
+
+ for pid in $pids; do
+ wait $pid || error "chmod failed"
+ done
+ echo "done ($(date +%H:%M:%S))"
+
+ # check chmod succeed
+ for i in $(seq $num); do
+ checkstat -vp 0600 $DIR/$tdir/file-$i ||
+ error "file-$i mode is not 0600"
+ done
+
+ rm -rf $DIR/$tdir
+}
+run_test 102d "check replay & reconstruction with multiple mod RPCs in flight"
+
+test_103() {
+ remote_mds_nodsh && skip "remote MDS with nodsh" && return
+#define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162
+ do_facet mds1 $LCTL set_param fail_loc=0x80000162
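+ # the 0x80000000 bit is the "fail once" flag: the fail_loc fires a
+ # single time and then clears itself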
+
+ mkdir -p $DIR/$tdir
+ createmany -o $DIR/$tdir/t- 30 ||
+ error "create files on remote directory failed"
+ sync
+ rm -rf $DIR/$tdir/t-*
+ sync
+# MDS should crash with tr->otr_next_id overflow
+ fail mds1
+}
+run_test 103 "Check otr_next_id overflow"
+
check_striped_dir_110()
{
$CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
}
run_test 115 "failover for create/unlink striped directory"
+test_116a() {
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+ [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.55) ] &&
+ skip "Do not support large update log before 2.7.55" &&
+ return 0
+ ([ $FAILURE_MODE == "HARD" ] &&
+ [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
+ skip "MDTs needs to be on diff hosts for HARD fail mode" &&
+ return 0
+ local fail_index=0
+
+ mkdir -p $DIR/$tdir
+ replay_barrier mds1
+
+ # OBD_FAIL_SPLIT_UPDATE_REC 0x1702
+ do_facet mds1 "lctl set_param fail_loc=0x80001702"
+ $LFS setdirstripe -c$MDSCOUNT $DIR/$tdir/striped_dir
+
+ fail mds1
+ $CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
+ error "stried_dir does not exists"
+}
+run_test 116a "large update log master MDT recovery"
+
+test_116b() {
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+ [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.55) ] &&
+ skip "Do not support large update log before 2.7.55" &&
+ return 0
+
+ ([ $FAILURE_MODE == "HARD" ] &&
+ [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
+ skip "MDTs needs to be on diff hosts for HARD fail mode" &&
+ return 0
+ local fail_index=0
+
+ mkdir -p $DIR/$tdir
+ replay_barrier mds2
+
+ # OBD_FAIL_SPLIT_UPDATE_REC 0x1702
+ do_facet mds2 "lctl set_param fail_loc=0x80001702"
+ $LFS setdirstripe -c$MDSCOUNT $DIR/$tdir/striped_dir
+
+ fail mds2
+ $CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
+ error "stried_dir does not exists"
+}
+run_test 116b "large update log slave MDT recovery"
+
+test_117() {
+ [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
+ ([ $FAILURE_MODE == "HARD" ] &&
+ [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
+ skip "MDTs needs to be on diff hosts for HARD fail mode" &&
+ return 0
+ local index
+ local mds_indexes
+
+ mkdir -p $DIR/$tdir
+ $LFS setdirstripe -i0 -c$MDSCOUNT $DIR/$tdir/remote_dir
+ $LFS setdirstripe -i1 -c$MDSCOUNT $DIR/$tdir/remote_dir_1
+ sleep 2
+
+ # Let's set rdonly on all MDTs, so client will send
+ # replay requests on all MDTs and replay these requests
+ # at the same time. This test will verify the recovery
+ # will not be deadlock in this case, LU-7531.
+ for ((index = 0; index < MDSCOUNT; index++)); do
+ replay_barrier mds$((index + 1))
+ if [ -z "$mds_indexes" ]; then
+ mds_indexes="mds$((index + 1))"
+ else
+ mds_indexes="$mds_indexes,mds$((index + 1))"
+ fi
+ done
+
+ rm -rf $DIR/$tdir/remote_dir
+ rm -rf $DIR/$tdir/remote_dir_1
+
+ fail $mds_indexes
+
+ rm -rf $DIR/$tdir || error "rmdir failed"
+}
+run_test 117 "DNE: cross MDT unlink, fail MDT1 and MDT2"
+
+test_118() {
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+ [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
+ skip "Do not support large update log before 2.7.64" &&
+ return 0
+
+ mkdir -p $DIR/$tdir
+
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir ||
+ error "setdirstripe fails"
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir1 ||
+ error "setdirstripe fails 1"
+ rm -rf $DIR/$tdir/striped_dir* || error "rmdir fails"
+
+ # OBD_FAIL_INVALIDATE_UPDATE 0x1705
+ do_facet mds1 "lctl set_param fail_loc=0x1705"
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir1
+ do_facet mds1 "lctl set_param fail_loc=0x0"
+
+ replay_barrier mds1
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir1
+ fail mds1
+
+ true
+}
+run_test 118 "invalidate osp update will not cause update log corruption"
+
+test_119() {
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+ [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
+ skip "Do not support large update log before 2.7.64" &&
+ return 0
+ local stripe_count
+ local hard_timeout=$(do_facet mds1 \
+ "lctl get_param -n mdt.$FSNAME-MDT0000.recovery_time_hard")
+
+ local clients=${CLIENTS:-$HOSTNAME}
+ local time_min=$(recovery_time_min)
+
+ mkdir -p $DIR/$tdir
+ mkdir $DIR/$tdir/tmp
+ rmdir $DIR/$tdir/tmp
+
+ replay_barrier mds1
+ mkdir $DIR/$tdir/dir_1
+ for ((i = 0; i < 20; i++)); do
+ $LFS setdirstripe -c2 $DIR/$tdir/stripe_dir-$i
+ done
+
+ stop mds1
+ change_active mds1
+ wait_for_facet mds1
+
+ #define OBD_FAIL_TGT_REPLAY_DELAY 0x714
+ do_facet mds1 $LCTL set_param fail_loc=0x80000714
+ # sleep (timeout + 5) so the MDS will evict the client exports,
+ # but DNE update recovery will keep going
+ do_facet mds1 $LCTL set_param fail_val=$((time_min + 5))
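+ # the injected delay exceeds the recovery_time_hard set at mount
+ # below, so ordinary client replay is guaranteed to time out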
+
+ mount_facet mds1 "-o recovery_time_hard=$time_min"
+
+ wait_clients_import_state "$clients" mds1 FULL
+
+ clients_up || clients_up || error "failover df: $?"
+
+ # restore the hard recovery timeout
+ do_facet mds1 $LCTL set_param \
+ mdt.$FSNAME-MDT0000.recovery_time_hard=$hard_timeout
+
+ for ((i = 0; i < 20; i++)); do
+ stripe_count=$($LFS getdirstripe -c $DIR/$tdir/stripe_dir-$i)
+ [ $stripe_count -eq 2 ] || {
+ error "stripe_dir-$i creation replay failed"
+ break
+ }
+ done
+}
+run_test 119 "timeout of normal replay does not cause DNE replay to fail"
+
+test_120() {
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+ [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
+ skip "Do not support large update log before 2.7.64" &&
+ return 0
+
+ mkdir $DIR/$tdir
+ replay_barrier_nosync mds1
+ for ((i = 0; i < 20; i++)); do
+ mkdir $DIR/$tdir/dir-$i || {
+ error "create dir-$i fails"
+ break
+ }
+ $LFS setdirstripe -c2 $DIR/$tdir/stripe_dir-$i || {
+ error "create stripe_dir-$i fails"
+ break
+ }
+ done
+
+ fail_abort mds1
+
+ for ((i = 0; i < 20; i++)); do
+ [ ! -e "$DIR/$tdir/dir-$i" ] || {
+ error "dir-$i still exists"
+ break
+ }
+ [ ! -e "$DIR/$tdir/stripe_dir-$i" ] || {
+ error "stripe_dir-$i still exists"
+ break
+ }
+ done
+}
+run_test 120 "DNE fail abort should stop both normal and DNE replay"
+
complete $SECONDS
check_and_cleanup_lustre
exit_status