+test_116a() {
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+ [ "$MDS1_VERSION" -lt $(version_code 2.7.55) ] &&
+ skip "Do not support large update log before 2.7.55" &&
+ return 0
+ ([ $FAILURE_MODE == "HARD" ] &&
+ [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
+ skip "MDTs needs to be on diff hosts for HARD fail mode" &&
+ return 0
+ local fail_index=0
+
+ mkdir -p $DIR/$tdir
+ replay_barrier mds1
+
+ # OBD_FAIL_SPLIT_UPDATE_REC 0x1702
+ do_facet mds1 "lctl set_param fail_loc=0x80001702"
+ $LFS setdirstripe -i0 -c$MDSCOUNT $DIR/$tdir/striped_dir
+
+ fail mds1
+ $CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
+ error "stried_dir does not exists"
+}
+run_test 116a "large update log master MDT recovery"
+
+test_116b() {
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+ [ "$MDS1_VERSION" -lt $(version_code 2.7.55) ] &&
+ skip "Do not support large update log before 2.7.55" &&
+ return 0
+
+ ([ $FAILURE_MODE == "HARD" ] &&
+ [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
+ skip "MDTs needs to be on diff hosts for HARD fail mode" &&
+ return 0
+ local fail_index=0
+
+ mkdir -p $DIR/$tdir
+ replay_barrier mds2
+
+ # OBD_FAIL_SPLIT_UPDATE_REC 0x1702
+ do_facet mds2 "lctl set_param fail_loc=0x80001702"
+ $LFS setdirstripe -i0 -c$MDSCOUNT $DIR/$tdir/striped_dir
+
+ fail mds2
+ $CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
+ error "stried_dir does not exists"
+}
+run_test 116b "large update log slave MDT recovery"
+
+test_117() {
+ [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
+ ([ $FAILURE_MODE == "HARD" ] &&
+ [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
+ skip "MDTs needs to be on diff hosts for HARD fail mode" &&
+ return 0
+ local index
+ local mds_indexs
+
+ mkdir -p $DIR/$tdir
+ $LFS setdirstripe -i0 -c$MDSCOUNT $DIR/$tdir/remote_dir
+ $LFS setdirstripe -i1 -c$MDSCOUNT $DIR/$tdir/remote_dir_1
+ sleep 2
+
+ # Let's set rdonly on all MDTs, so client will send
+ # replay requests on all MDTs and replay these requests
+ # at the same time. This test will verify the recovery
+ # will not be deadlock in this case, LU-7531.
+ for ((index = 0; index < $((MDSCOUNT)); index++)); do
+ replay_barrier mds$((index + 1))
+ if [ -z $mds_indexs ]; then
+ mds_indexs="${mds_indexs}mds$((index+1))"
+ else
+ mds_indexs="${mds_indexs},mds$((index+1))"
+ fi
+ done
+
+ rm -rf $DIR/$tdir/remote_dir
+ rm -rf $DIR/$tdir/remote_dir_1
+
+ fail $mds_indexs
+
+ rm -rf $DIR/$tdir || error "rmdir failed"
+}
+run_test 117 "DNE: cross MDT unlink, fail MDT1 and MDT2"
+
+test_118() {
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+ [ "$MDS1_VERSION" -lt $(version_code 2.7.64) ] &&
+ skip "Do not support large update log before 2.7.64" &&
+ return 0
+
+ mkdir -p $DIR/$tdir
+
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir ||
+ error "setdirstripe fails"
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir1 ||
+ error "setdirstripe fails 1"
+ rm -rf $DIR/$tdir/striped_dir* || error "rmdir fails"
+
+ # OBD_FAIL_INVALIDATE_UPDATE 0x1705
+ do_facet mds1 "lctl set_param fail_loc=0x1705"
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir1
+ do_facet mds1 "lctl set_param fail_loc=0x0"
+
+ replay_barrier mds1
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir
+ $LFS setdirstripe -c2 $DIR/$tdir/striped_dir1
+ fail mds1
+
+ true
+}
+run_test 118 "invalidate osp update will not cause update log corruption"
+
+test_119() {
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+ [ "$MDS1_VERSION" -lt $(version_code 2.7.64) ] &&
+ skip "Do not support large update log before 2.7.64" &&
+ return 0
+ local stripe_count
+ local hard_timeout=$(do_facet mds1 \
+ "lctl get_param -n mdt.$FSNAME-MDT0000.recovery_time_hard")
+
+ local clients=${CLIENTS:-$HOSTNAME}
+ local time_min=$(recovery_time_min)
+
+ mkdir -p $DIR/$tdir
+ mkdir $DIR/$tdir/tmp
+ rmdir $DIR/$tdir/tmp
+
+ replay_barrier mds1
+ mkdir $DIR/$tdir/dir_1
+ for ((i = 0; i < 20; i++)); do
+ $LFS setdirstripe -i0 -c2 $DIR/$tdir/stripe_dir-$i
+ done
+
+ stop mds1
+ change_active mds1
+ wait_for_facet mds1
+
+ #define OBD_FAIL_TGT_REPLAY_DELAY 0x714
+ do_facet mds1 $LCTL set_param fail_loc=0x80000714
+ #sleep (timeout + 5), so mds will evict the client exports,
+ #but DNE update recovery will keep going.
+ do_facet mds1 $LCTL set_param fail_val=$((time_min + 5))
+
+ mount_facet mds1 "-o recovery_time_hard=$time_min"
+
+ wait_clients_import_state "$clients" mds1 FULL
+
+ clients_up || clients_up || error "failover df: $?"
+
+ #revert back the hard timeout
+ do_facet mds1 $LCTL set_param \
+ mdt.$FSNAME-MDT0000.recovery_time_hard=$hard_timeout
+
+ for ((i = 0; i < 20; i++)); do
+ stripe_count=$($LFS getdirstripe -c $DIR/$tdir/stripe_dir-$i)
+ [ $stripe_count == 2 ] || {
+ error "stripe_dir-$i creation replay fails"
+ break
+ }
+ done
+}
+run_test 119 "timeout of normal replay does not cause DNE replay fails "
+
+test_120() {
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+ [ "$MDS1_VERSION" -lt $(version_code 2.7.64) ] &&
+ skip "Do not support large update log before 2.7.64" &&
+ return 0
+
+ mkdir $DIR/$tdir
+ replay_barrier_nosync mds1
+ for ((i = 0; i < 20; i++)); do
+ mkdir $DIR/$tdir/dir-$i || {
+ error "create dir-$i fails"
+ break
+ }
+ $LFS setdirstripe -i0 -c2 $DIR/$tdir/stripe_dir-$i || {
+ error "create stripe_dir-$i fails"
+ break
+ }
+ done
+
+ fail_abort mds1
+
+ for ((i = 0; i < 20; i++)); do
+ [ ! -e "$DIR/$tdir/dir-$i" ] || {
+ error "dir-$i still exists"
+ break
+ }
+ [ ! -e "$DIR/$tdir/stripe_dir-$i" ] || {
+ error "stripe_dir-$i still exists"
+ break
+ }
+ done
+}
+run_test 120 "DNE fail abort should stop both normal and DNE replay"
+
+test_121() {
+ [ "$MDS1_VERSION" -lt $(version_code 2.10.90) ] &&
+ skip "Don't support it before 2.11" &&
+ return 0
+
+ local at_max_saved=$(at_max_get mds)
+
+ touch $DIR/$tfile || error "touch $DIR/$tfile failed"
+ cancel_lru_locks mdc
+
+ multiop_bg_pause $DIR/$tfile s_s || error "multiop $DIR/$tfile failed"
+ mpid=$!
+
+ lctl set_param -n ldlm.cancel_unused_locks_before_replay "0"
+
+ stop mds1
+ change_active mds1
+ wait_for_facet mds1
+
+ #define OBD_FAIL_TGT_RECOVERY_REQ_RACE 0x721
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x721 fail_val=0"
+ at_max_set 0 mds
+
+ mount_facet mds1
+ wait_clients_import_state "$clients" mds1 FULL
+ clients_up || clients_up || error "failover df: $?"
+
+ kill -USR1 $mpid
+ wait $mpid || error "multiop_bg_pause pid failed"
+
+ do_facet $SINGLEMDS "lctl set_param fail_loc=0x0"
+ lctl set_param -n ldlm.cancel_unused_locks_before_replay "1"
+ at_max_set $at_max_saved mds
+ rm -f $DIR/$tfile
+}
+run_test 121 "lock replay timed out and race"
+
+test_130a() {
+ [ "$MDS1_VERSION" -lt $(version_code 2.10.90) ] &&
+ skip "Do not support Data-on-MDT before 2.11"
+
+ replay_barrier $SINGLEMDS
+ $LFS setstripe -E 1M -L mdt -E EOF -c 2 $DIR/$tfile
+ fail $SINGLEMDS
+
+ [ $($LFS getstripe -L $DIR/$tfile) == "mdt" ] ||
+ error "Fail to replay DoM file creation"
+}
+run_test 130a "DoM file create (setstripe) replay"
+
+test_130b() {
+ [ "$MDS1_VERSION" -lt $(version_code 2.10.90) ] &&
+ skip "Do not support Data-on-MDT before 2.11"
+
+ mkdir $DIR/$tdir
+ $LFS setstripe -E 1M -L mdt -E EOF -c 2 $DIR/$tdir
+ replay_barrier $SINGLEMDS
+ touch $DIR/$tdir/$tfile
+ fail $SINGLEMDS
+
+ [ $($LFS getstripe -L $DIR/$tdir/$tfile) == "mdt" ] ||
+ error "Fail to replay DoM file creation"
+}
+run_test 130b "DoM file create (inherited) replay"
+
+test_131a() {
+ [ "$MDS1_VERSION" -lt $(version_code 2.10.90) ] &&
+ skip "Do not support Data-on-MDT before 2.11"
+
+ $LFS setstripe -E 1M -L mdt -E EOF -c 2 $DIR/$tfile
+ replay_barrier $SINGLEMDS
+ echo "dom_data" | dd of=$DIR/$tfile bs=8 count=1
+ # lock is not canceled and will be replayed
+ fail $SINGLEMDS
+
+ [ $(cat $DIR/$tfile) == "dom_data" ] ||
+ error "Wrong file content after failover"
+}
+run_test 131a "DoM file write lock replay"
+
+test_131b() {
+ [ "$MDS1_VERSION" -lt $(version_code 2.10.90) ] &&
+ skip "Do not support Data-on-MDT before 2.11"
+
+ $LFS setstripe -E 1M -L mdt -E EOF -c 2 $DIR/$tfile
+ replay_barrier $SINGLEMDS
+ echo "dom_data" | dd of=$DIR/$tfile bs=8 count=1
+ cancel_lru_locks mdc
+
+ fail $SINGLEMDS
+
+ [ $(cat $DIR/$tfile) == "dom_data" ] ||
+ error "Wrong file content after failover"
+}
+run_test 131b "DoM file write replay"
+
+test_132a() {
+ [ "$MDS1_VERSION" -lt $(version_code 2.12.0) ] &&
+ skip "Need MDS version 2.12.0 or later"
+
+ $LFS setstripe -E 1M -c 1 -E EOF -c 2 $DIR/$tfile
+ replay_barrier $SINGLEMDS
+ # write over the first component size cause next component instantiation
+ dd if=/dev/urandom of=$DIR/$tfile bs=1M count=1 seek=1 ||
+ error "dd to $DIR/$tfile failed"
+ lfs getstripe $DIR/$tfile
+
+ cksum=$(md5sum $DIR/$tfile | awk '{print $1}')
+ $LFS getstripe -I2 $DIR/$tfile | grep -q lmm_objects ||
+ error "Component #1 was not instantiated"
+
+ fail $SINGLEMDS
+
+ lfs getstripe $DIR/$tfile
+ $LFS getstripe -I2 $DIR/$tfile | grep -q lmm_objects ||
+ error "Component #1 instantiation was not replayed"
+ cksum2=$(md5sum $DIR/$tfile | awk '{print $1}')
+ if [ $cksum != $cksum2 ] ; then
+ error_noexit "New cksum $cksum2 does not match original $cksum"
+ fi
+}
+run_test 132a "PFL new component instantiate replay"
+
+test_133() {
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
+ ([ $FAILURE_MODE == "HARD" ] &&
+ [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
+ skip "MDTs needs to be on diff hosts for HARD fail mode" &&
+ return 0
+
+ local remote_dir=$DIR/$tdir/remote_dir
+
+ mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed"
+ $LFS mkdir -i 1 $remote_dir
+
+ umount $MOUNT
+ do_facet mds2 $LCTL set_param seq.srv*MDT0001.space=clear
+
+ zconf_mount $(hostname) $MOUNT
+ client_up || return 1
+
+ #define OBD_FAIL_MDS_ALL_REQUEST_NET 0x123
+ # SEQ_QUERY = 700
+ do_facet mds1 $LCTL set_param fail_val=700 fail_loc=0x80000123
+ cp /etc/hosts $remote_dir/file &
+ local pid=$!
+ sleep 1
+
+ fail_nodf mds1
+
+ wait $pid || error "cp failed"
+ rm -rf $DIR/$tdir || error "rmdir failed"
+
+ return 0
+}
+run_test 133 "check resend of ongoing requests for lwp during failover"
+
+test_134() {
+ [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return 0
+
+ pool_add pool_134
+ pool_add_targets pool_134 1 1
+
+ mkdir -p $DIR/$tdir/{A,B}
+ $LFS setstripe -p pool_134 $DIR/$tdir/A
+ $LFS setstripe -E EOF -p pool_134 $DIR/$tdir/B
+
+ replay_barrier mds1
+
+ touch $DIR/$tdir/A/$tfile || error "touch non-pfl file failed"
+ touch $DIR/$tdir/B/$tfile || error "touch pfl failed"
+
+ fail mds1
+
+ [ -f $DIR/$tdir/A/$tfile ] || error "non-pfl file does not exist"
+ [ -f $DIR/$tdir/B/$tfile ] || error "pfl file does not exist"
+}
+run_test 134 "replay creation of a file created in a pool"
+